[irstlm] 14/126: Added draft version of context-dependent Language model

Tue May 17 07:46:40 UTC 2016

This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.

commit 390a87b720e96f7e24ce943a7ece15cf1a88e759
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Tue Jul 21 11:27:42 2015 +0200

    Added draft version of context-dependent Language model
---
 src/lmContextDependent.cpp | 152 +++++++++++++++++++++++++++++++++++++++++++++
 src/lmContextDependent.h   | 141 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 293 insertions(+)

diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
new file mode 100644
index 0000000..d4004a4
--- /dev/null
+++ b/src/lmContextDependent.cpp
@@ -0,0 +1,152 @@
+// $Id: lmContextDependent.cpp 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ 
+ ******************************************************************************/
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include "lmContainer.h"
+#include "lmContextDependent.h"
+#include "util.h"
+
+using namespace std;
+	
+inline void error(const char* message)
+{
+  std::cerr << message << "\n";
+  throw std::runtime_error(message);
+}
+
+namespace irstlm {
+/**
+ * Builds an empty context-dependent LM; no model is loaded until load() runs.
+ *
+ * @param nlf  n-gram cache load factor; load() hands it to the factory that
+ *             creates the wrapped LM
+ * @param dlf  dictionary load factor; load() hands it to the same factory
+ *
+ * Every data member gets a defined value here. `dict` in particular must
+ * start as NULL because setDict() tests it before deleting and getDict()
+ * returns it; `logOOVpenalty`, `m_isinverted` and the two weights are read by
+ * accessors or by lprob() and would otherwise be indeterminate before load().
+ */
+lmContextDependent::lmContextDependent(float nlf, float dlf)
+{
+  ngramcache_load_factor = nlf;
+  dictionary_load_factor = dlf;
+  m_lm=NULL;
+  m_topicmodel=NULL;
+  dict=NULL;
+
+  order=0;
+  memmap=0;
+  dictionary_upperbound=0;
+  logOOVpenalty=0.0;
+  isInverted=false;
+  m_isinverted=false;
+
+  m_lm_weight=0.0;
+  m_topicmodel_weight=0.0;
+}
+
+// Destroys the wrapped n-gram LM and the topic model held by this object.
+lmContextDependent::~lmContextDependent()
+{
+  // deleting a null pointer is a no-op, so no explicit guard is required
+  delete m_lm;
+  delete m_topicmodel;
+}
+
+/**
+ * Loads the model described by a plain-text configuration file.
+ *
+ * The file is consumed line by line:
+ *   line 1: header whose first token is LMCONTEXTDEPENDENT (or the same word
+ *           in lower case)
+ *   line 2: weight_of_ngram_LM filename_of_LM [inverted]
+ *   line 3: weight_of_topic_model filename_of_topic_model
+ *
+ * @param filename  path of the configuration file
+ * @param mmap      memory-mapping level; stored in the `memmap` member and
+ *                  passed to the wrapped LM's load()
+ * @throws std::runtime_error (raised through error()) when the file cannot be
+ *         opened, a line does not have the expected number of tokens, or the
+ *         n-gram LM object cannot be created
+ */
+void lmContextDependent::load(const std::string &filename,int mmap)
+{
+  VERBOSE(2,"lmContextDependent::load(const std::string &filename,int memmap)" << std::endl);
+  VERBOSE(2," filename:|" << filename << "|" << std::endl);
+
+  // single copy of the message reported for every malformed line
+  const char* const format_error = "ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nweight_of_ngram_LM filename_of_LM\nweight_of_topic_model filename_of_topic_model";
+
+  dictionary_upperbound=1000000;
+  // assign the data member; a local declaration here would shadow it and
+  // leave the member at its constructor value
+  memmap=mmap;
+
+  //get info from the configuration file
+  fstream inp(filename.c_str(),ios::in|ios::binary);
+  if (!inp.is_open()) {
+    // report the real cause instead of a misleading "wrong header" message
+    const std::string open_error = "ERROR: cannot open configuration file " + filename;
+    error(open_error.c_str());
+  }
+
+  char line[MAX_LINE];
+  const char* words[LMCONFIGURE_MAX_TOKEN];
+  int tokenN;
+
+  //reading the header
+  inp.getline(line,MAX_LINE,'\n');
+  tokenN = parseWords(line,words,LMCONFIGURE_MAX_TOKEN);
+
+  // NOTE(review): this check demands exactly two tokens on the header line,
+  // while the format shown in the error message has the keyword alone on its
+  // line; confirm which of the two is intended.
+  if (tokenN != 2 || ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0)))
+    error(format_error);
+
+  //reading ngram-based LM
+  // every read is bounded by MAX_LINE, the actual size of `line`
+  inp.getline(line,MAX_LINE,'\n');
+  tokenN = parseWords(line,words,LMCONFIGURE_MAX_TOKEN);
+
+  if(tokenN < 2 || tokenN >3) {
+    error(format_error);
+  }
+
+  //check whether the (textual) LM has to be loaded as inverted
+  m_isinverted = false;
+  if(tokenN == 3) {
+    if (strcmp(words[2],"inverted") == 0)
+      m_isinverted = true;
+  }
+  VERBOSE(2,"m_isinverted:" << m_isinverted << endl);
+
+  m_lm_weight = (float) atof(words[0]);
+
+  //checking the language model type
+  m_lm=lmContainer::CreateLanguageModel(words[1],ngramcache_load_factor,dictionary_load_factor);
+  if (m_lm == NULL) {
+    // fail with an exception rather than dereferencing a null pointer below
+    error("ERROR: cannot create the n-gram language model");
+  }
+
+  //let know that table has inverted n-grams
+  m_lm->is_inverted(m_isinverted);  //set inverted flag for the wrapped LM
+
+  m_lm->setMaxLoadedLevel(requiredMaxlev);
+
+  m_lm->load(words[1], memmap);
+  // the dictionary pointer is taken from the wrapped LM, not created here
+  dict=m_lm->getDict();
+  getDict()->genoovcode();
+
+  m_lm->init_caches(m_lm->maxlevel());
+
+
+  //reading bigram-based topic model
+  inp.getline(line,MAX_LINE,'\n');
+  tokenN = parseWords(line,words,LMCONFIGURE_MAX_TOKEN);
+
+  if(tokenN < 2 || tokenN >3) {
+    error(format_error);
+  }
+
+  //loading topic model and initialization
+  //only the weight is read for now; the model itself is not created yet
+  m_topicmodel_weight = (float) atof(words[0]);
+  //m_topic_model = new  xxxxxxxxxxxxxxxx
+
+
+  inp.close();
+}
+
+/**
+ * Scores an n-gram given as an array of word codes.
+ *
+ * @param codes          array holding `sz` word codes
+ * @param sz             number of codes in `codes`
+ * @param topic_weights  topic weights; not consulted yet because the topic
+ *                       score is still a fixed placeholder
+ * @param bow            forwarded unchanged to the wrapped LM's clprob()
+ * @param bol            forwarded unchanged to the wrapped LM's clprob()
+ * @param maxsuffptr     forwarded unchanged to the wrapped LM's clprob()
+ * @param statesize      forwarded unchanged to the wrapped LM's clprob()
+ * @param extendible     forwarded unchanged to the wrapped LM's clprob()
+ * @return m_lm_weight * (score of the wrapped LM)
+ *         + m_topicmodel_weight * (topic score, currently 0.0)
+ */
+double lmContextDependent::lprob(int* codes, int sz, topic_map& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+{
+  (void) topic_weights;  // unused until the topic model is plugged in
+
+  //create the actual ngram
+  ngram ong(dict);
+  ong.pushc(codes,sz);
+  MY_ASSERT (ong.size == sz);
+
+  double lm_prob = m_lm->clprob(ong, bow, bol, maxsuffptr, statesize, extendible);
+  double topic_prob = 0.0;  // to_CHECK: placeholder for the topic-model score
+  double ret_prob = m_lm_weight * lm_prob + m_topicmodel_weight * topic_prob;
+
+  // Return the combined score. It must not be routed through this class's own
+  // clprob(ngram, ...) overload: that one is an unsupported stub that asserts.
+  return ret_prob;
+}
+
+/**
+ * Sets the OOV penalty from a dictionary upper bound.
+ *
+ * The bound is handed to the wrapped LM, and the log penalty that LM then
+ * reports is stored as this object's own log penalty.
+ *
+ * @param dub  dictionary upper bound; asserted to exceed the dictionary size
+ * @return the stored log OOV penalty
+ */
+double lmContextDependent::setlogOOVpenalty(int dub)
+{
+  MY_ASSERT(dub > dict->size());
+  m_lm->setlogOOVpenalty(dub);  //set OOV Penalty by means of DUB
+
+  // getlogOOVpenalty() already yields a log-domain value, so it is stored
+  // as is; applying log() to it again would take the logarithm of a logarithm
+  // (NaN whenever the log penalty is negative).
+  // NOTE(review): m_lm_weight is not applied to the penalty - confirm intent.
+  logOOVpenalty = m_lm->getlogOOVpenalty();
+  return logOOVpenalty;
+}
+}//namespace irstlm
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
new file mode 100644
index 0000000..418fe70
--- /dev/null
+++ b/src/lmContextDependent.h
@@ -0,0 +1,141 @@
+// $Id: lmContextDependent.h 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+IrstLM: IRST Language Model Toolkit
+Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+
+******************************************************************************/
+
+#ifndef MF_LMCONTEXTDEPENDENT_H
+#define MF_LMCONTEXTDEPENDENT_H
+
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <string>
+#include <math.h>
+#include <map>
+#include "util.h"
+#include "dictionary.h"
+#include "n_gram.h"
+#include "lmContainer.h"
+
+// Associates a string key with a float value; it is the type of the
+// topic_weights argument taken by lmContextDependent::clprob() and lprob().
+// NOTE(review): presumably topic label -> weight; confirm once the topic
+// model is implemented.
+typedef std::map< std::string, float > topic_map;
+	
+namespace irstlm {
+/*
+Context-dependent LM
+Wrapper LM which combines a standard word-based n-gram LM
+with a bigram-based topic model
+*/
+
+// Capacity of the token array that lmContextDependent::load() fills when it
+// splits a configuration line with parseWords().
+#define LMCONFIGURE_MAX_TOKEN 3
+
+/**
+ * Language model that wraps one n-gram LM (m_lm) together with a topic-model
+ * placeholder (m_topicmodel) and combines their scores through two weights.
+ *
+ * Only the clprob()/lprob() overloads that take a topic_map are supported;
+ * the two topic-less clprob() overloads are stubs that assert.
+ */
+class lmContextDependent: public lmContainer
+{
+  static const bool debug=true;
+  int order;
+  int dictionary_upperbound; //set by user
+  double  logOOVpenalty; //penalty for OOV words (default 0)
+  bool      isInverted;   //written by is_inverted(bool)
+  int memmap;  //level from which n-grams are accessed via mmap
+
+  lmContainer* m_lm;      //wrapped n-gram LM; deleted by the destructor
+  std::string m_lm_file;
+  bool m_isinverted;      //value reported by is_inverted()
+
+//  TopicModel* m_topicmodel;
+  lmContainer* m_topicmodel;   //to remove when TopicModel is ready
+  double m_lm_weight;
+
+  double m_topicmodel_weight;
+  std::string m_topicmodel_file;
+
+  float ngramcache_load_factor;
+  float dictionary_load_factor;
+
+  dictionary *dict; // dictionary returned by getDict() and used to build n-grams
+
+  // The destructor deletes m_lm and m_topicmodel, so a member-wise copy would
+  // lead to a double delete. Copying is therefore disabled (declared private,
+  // intentionally left undefined).
+  lmContextDependent(const lmContextDependent&);
+  lmContextDependent& operator=(const lmContextDependent&);
+
+public:
+
+  lmContextDependent(float nlf=0.0, float dlfi=0.0);
+  virtual ~lmContextDependent();
+
+  // Reads the configuration file `filename`; `mmap` is the memory-mapping level.
+  void load(const std::string &filename,int mmap=0);
+
+  // Unsupported: scoring without topic weights. Asserts in debug builds; the
+  // trailing return only keeps the function well-defined when assert() is
+  // compiled out (flowing off the end of a non-void function is undefined).
+  virtual double clprob(ngram ng,            double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
+        VERBOSE(0, "This LM type (lmContextDependent) does not support this function");
+        assert(false);
+        return 0.0;
+  };
+
+  // Unsupported: scoring without topic weights (see the overload above).
+  virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
+	VERBOSE(0, "This LM type (lmContextDependent) does not support this function");
+	assert(false);
+	return 0.0;
+  };
+
+  // Supported entry point: simply forwards to lprob().
+  virtual double clprob(int* ng, int ngsize, topic_map& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
+	return lprob(ng, ngsize, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
+  };
+  // Scores the n-gram made of the `ngsize` codes in `ng`; defined in the .cpp file.
+  virtual double lprob(int* ng, int ngsize, topic_map& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
+
+  int maxlevel() const {
+    return maxlev;
+  };
+
+  // Replaces the dictionary pointer, deleting the previous one if non-null.
+  // NOTE(review): load() takes `dict` from m_lm->getDict(); if the wrapped LM
+  // also frees that dictionary, deleting it here would free it twice - confirm
+  // the ownership before relying on this after load().
+  virtual inline void setDict(dictionary* d) {
+    if (dict) delete dict;
+    dict=d;
+  };
+	
+  virtual inline dictionary* getDict() const {
+    return dict;
+  };
+
+  //get penalty for OOV words
+  virtual inline double getlogOOVpenalty() const {
+    return logOOVpenalty;
+  }
+
+  //set penalty for OOV words from a dictionary upper bound
+  virtual double setlogOOVpenalty(int dub);
+
+  //set penalty for OOV words directly
+  double inline setlogOOVpenalty(double oovp) {
+    return logOOVpenalty=oovp;
+  }
+
+//set the inverted flag
+//NOTE(review): this writes isInverted, whereas is_inverted() below reads
+//m_isinverted, so the setter does not affect what the getter reports - confirm
+  inline bool is_inverted(const bool flag) {
+    return isInverted = flag;
+  }
+
+//returns the inverted flag recorded for the wrapped LM
+  inline bool is_inverted() {
+    return m_isinverted;
+  }
+
+  inline virtual void dictionary_incflag(const bool flag) {
+    dict->incflag(flag);
+  };
+
+  inline virtual bool is_OOV(int code) { //returns what the wrapped LM reports for this code
+    return m_lm->is_OOV(code);
+  }
+};
+}//namespace irstlm
+
+#endif
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git