[irstlm] 27/78: missing files
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:03 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.
commit 895f2325c946a7e83c12cfb3087ea869d9bef9a5
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Mon Nov 9 09:54:09 2015 +0100
missing files
---
src/lmContextDependent.cpp | 247 +++++++++++++++++++++++++++++++++++++++++++++
src/lmContextDependent.h | 199 ++++++++++++++++++++++++++++++++++++
2 files changed, 446 insertions(+)
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
new file mode 100644
index 0000000..39b0c95
--- /dev/null
+++ b/src/lmContextDependent.cpp
@@ -0,0 +1,247 @@
+// $Id: lmContextDependent.cpp 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include "lmContainer.h"
+#include "lmContextDependent.h"
+#include "util.h"
+
+using namespace std;
+
+// Report an unrecoverable configuration-file problem: print the message to
+// stderr, then abort the caller by throwing std::runtime_error.
+inline void error(const char* message)
+{
+	std::cerr << message << "\n";
+	throw std::runtime_error(message);
+}
+
+namespace irstlm {
+
+	/// Build an empty context-dependent LM wrapper; the wrapped word LM and the
+	/// similarity model are created later by load().
+	/// @param nlf load factor for the ngram caches of the wrapped LM
+	/// @param dlf load factor for the dictionaries of the wrapped LM
+	lmContextDependent::lmContextDependent(float nlf, float dlf)
+	{
+		ngramcache_load_factor = nlf;
+		dictionary_load_factor = dlf;
+		m_lm=NULL;
+		m_similaritymodel=NULL;
+
+		order=0;
+		memmap=0;
+		isInverted=false;
+		// NOTE(review): m_isadaptive is not declared in lmContextDependent.h;
+		// presumably it comes from the lmContainer base class -- confirm
+		m_isadaptive=true;
+
+		// fix: the trace used to print load()'s signature instead of this constructor's
+		VERBOSE(2,"lmContextDependent::lmContextDependent(float nlf, float dlf) isadaptive:|" << m_isadaptive << "|" << std::endl);
+	}
+
+	/// Release the owned word LM and similarity model.
+	lmContextDependent::~lmContextDependent()
+	{
+		// delete on a null pointer is a no-op, so no guards are needed
+		delete m_lm;
+		delete m_similaritymodel;
+	}
+
+	/// Load the wrapper from a configuration file with the layout:
+	///   line 1: LMCONTEXTDEPENDENT (or lowercase)
+	///   line 2: filename of the wrapped ngram LM
+	///   line 3: weight k_model hk_model hwk_model pruning_threshold [smoothing]
+	/// @param filename path of the configuration file
+	/// @param mmap     mmap level forwarded to the wrapped LM's load()
+	void lmContextDependent::load(const std::string &filename,int mmap)
+	{
+		VERBOSE(2,"lmContextDependent::load(const std::string &filename,int memmap)" << std::endl);
+		VERBOSE(2,"configuration file:|" << filename << "|" << std::endl);
+
+		dictionary_upperbound=1000000;
+
+		//get info from the configuration file
+		fstream inp(filename.c_str(),ios::in|ios::binary);
+		VERBOSE(0, "filename:|" << filename << "|" << std::endl);
+		// fix: fail early with a clear message instead of parsing a dead stream
+		if (!inp)
+			error("ERROR: cannot open configuration file");
+
+		char line[MAX_LINE];
+		const char* words[LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN];
+		int tokenN;
+		inp.getline(line,MAX_LINE,'\n');
+		tokenN = parseWords(line,words,LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN);
+
+		// fix: error() takes const char*, the (char*) casts were unnecessary
+		if (tokenN != 1 || ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0)))
+			error("ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
+
+		//reading ngram-based LM
+		// fix: read at most MAX_LINE chars (the buffer's size); the original used
+		// BUFSIZ, which may exceed MAX_LINE and overflow 'line'
+		inp.getline(line,MAX_LINE,'\n');
+		tokenN = parseWords(line,words,1);
+		if(tokenN < 1 || tokenN > 1) {
+			error("ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
+		}
+
+		VERBOSE(0, "model_w:|" << words[0] << "|" << std::endl);
+		//checking the language model type
+		m_lm=lmContainer::CreateLanguageModel(words[0],ngramcache_load_factor, dictionary_load_factor);
+
+		m_lm->setMaxLoadedLevel(requiredMaxlev);
+
+		// fix: pass 'mmap' directly; the removed local 'int memmap=mmap' shadowed
+		// the data member of the same name
+		m_lm->load(words[0], mmap);
+		maxlev=m_lm->maxlevel();
+		dict=m_lm->getDict();
+		getDict()->genoovcode();
+
+		m_lm->init_caches(m_lm->maxlevel());
+
+		//reading topic model
+		inp.getline(line,MAX_LINE,'\n');
+		tokenN = parseWords(line,words,LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN);
+
+		if(tokenN < 5 || tokenN > LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN) {
+			error("ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
+		}
+
+		//loading topic model and initialization
+		m_similaritymodel_weight = (float) atof(words[0]);
+		std::string _k_ngt = words[1];
+		std::string _hk_ngt = words[2];
+		std::string _hwk_ngt = words[3];
+		int _thr = atoi(words[4]);
+		double _smoothing = 0.1;  // default shift-beta smoothing when not configured
+		if (tokenN == 6){ _smoothing = atof(words[5]); }
+		m_similaritymodel = new ContextSimilarity(_k_ngt, _hk_ngt, _hwk_ngt);
+		m_similaritymodel->set_Threshold_on_H(_thr);
+		m_similaritymodel->set_SmoothingValue(_smoothing);
+
+		inp.close();
+
+		VERBOSE(0, "model_k:|" << _k_ngt << "|" << std::endl);
+		VERBOSE(0, "model_hk:|" << _hk_ngt << "|" << std::endl);
+		VERBOSE(0, "model_hwk:|" << _hwk_ngt << "|" << std::endl);
+		VERBOSE(0, "topic_threshold_on_h:|" << m_similaritymodel->get_Threshold_on_H() << "|" << std::endl);
+		VERBOSE(0, "shift-beta smoothing on counts:|" << m_similaritymodel->get_SmoothingValue() << "|" << std::endl);
+	}
+
+	/// log10 probability of 'ng' without topic weights: the similarity model is
+	/// not consulted, so the score is just the wrapped word LM's.
+	double lmContextDependent::lprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
+	{
+		VERBOSE(2,"lmContextDependent::lprob(ngram ng, ...)" << std::endl);
+		// rebuild the surface strings of (at most) the last two words; the
+		// string-based overload below expects words, not dictionary codes
+		string_vec_t text;
+		if (ng.size>1){
+			text.push_back(ng.dict->decode(*ng.wordp(2)));
+		}
+		text.push_back(ng.dict->decode(*ng.wordp(1)));
+
+		return lprob(ng, text, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+	}
+
+	/// log10 probability of a word sequence given as strings, without topic
+	/// weights. Encodes the strings into an ngram over the shared dictionary
+	/// and delegates to the core (ngram, text) overload.
+	double lmContextDependent::lprob(string_vec_t& text, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
+	{
+		VERBOSE(2,"lmContextDependent::lprob(string_vec_t& text, ...)" << std::endl);
+
+		//create the actual ngram
+		ngram ng(dict);
+		ng.pushw(text);
+		VERBOSE(3,"ng:|" << ng << "|" << std::endl);
+
+		// every input word must have produced one ngram entry
+		MY_ASSERT (ng.size == (int) text.size());
+		return lprob(ng, text, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+	}
+
+	/// Core scorer for the no-topic case: returns the wrapped LM's log10
+	/// probability unchanged; the similarity model is skipped entirely.
+	/// @param text unused here (kept for signature symmetry with the topic variant)
+	double lmContextDependent::lprob(ngram& ng, string_vec_t& text, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
+	{
+		UNUSED(text);
+		VERBOSE(2,"lmContextDependent::lprob(ngram& ng, string_vec_t& text, ...)" << std::endl);
+		double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+		double ret_logprob = lm_logprob;
+		VERBOSE(2, "lm_log10_pr:" << lm_logprob << " similarity_score:_undef_ m_similaritymodel_weight:_undef_ ret_log10_pr:" << ret_logprob << std::endl);
+
+		return ret_logprob;
+	}
+
+	/// log10 probability of 'ng' under the given topic weights: rebuilds the
+	/// surface strings of (at most) the last two words, then delegates to the
+	/// core (ngram, text, topics) overload that applies the similarity model.
+	double lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
+	{
+		VERBOSE(2,"lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, ...)" << std::endl);
+		string_vec_t text;
+		if (ng.size>1){
+			text.push_back(ng.dict->decode(*ng.wordp(2)));
+		}
+		text.push_back(ng.dict->decode(*ng.wordp(1)));
+
+		return lprob(ng, text, topic_weights, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+	}
+
+	/// log10 probability of a word sequence (as strings) under the given topic
+	/// weights. Encodes the strings into an ngram over the shared dictionary
+	/// and delegates to the core (ngram, text, topics) overload.
+	double lmContextDependent::lprob(string_vec_t& text, topic_map_t& topic_weights, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
+	{
+		VERBOSE(2,"lmContextDependent::lprob(string_vec_t& text, topic_map_t& topic_weights, ...)" << std::endl);
+
+		//create the actual ngram
+		ngram ng(dict);
+		ng.pushw(text);
+		VERBOSE(3,"ng:|" << ng << "|" << std::endl);
+
+		// every input word must have produced one ngram entry
+		MY_ASSERT (ng.size == (int) text.size());
+		return lprob(ng, text, topic_weights, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+	}
+
+	/// Core scorer with topic weights: log-linear combination of the wrapped
+	/// LM's log10 probability and the weighted context-similarity score.
+	double lmContextDependent::lprob(ngram& ng, string_vec_t& text, topic_map_t& topic_weights, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
+	{
+		VERBOSE(2,"lmContextDependent::lprob(ngram& ng, string_vec_t& text, topic_map_t& topic_weights, ...)" << std::endl);
+		// double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
+		double lm_logprob = m_lm->clprob(ng, topic_weights, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+		double similarity_score = m_similaritymodel->context_similarity(text, topic_weights);
+		// m_similaritymodel_weight scales how much the topic model influences the score
+		double ret_logprob = lm_logprob + m_similaritymodel_weight * similarity_score;
+		VERBOSE(2, "lm_log10_pr:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_log10_pr:" << ret_logprob << std::endl);
+
+		return ret_logprob;
+	}
+
+
+	/// Sum the probabilities of every dictionary word substituted into the last
+	/// position of 'text' (history kept fixed) and return the total as log10;
+	/// useful to check how well the combined model is normalized.
+	/// NOTE: 'text' is modified in place (its last element is overwritten).
+	double lmContextDependent::total_clprob(string_vec_t& text, topic_map_t& topic_weights)
+	{
+		VERBOSE(2,"lmContextDependent::total_lprob(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
+		double tot_pr = 0.0;
+		double v_pr;
+		for (int v=0; v<dict->size(); ++v){
+			//replace the last word of the sequence, keeping the history unchanged
+			text.at(text.size()-1) = dict->decode(v);
+			v_pr = clprob(text, topic_weights);
+			tot_pr += pow(10.0,v_pr); //v_pr is a log10 prob
+		}
+		return log10(tot_pr);
+	}
+
+	/// Sum the probabilities of every dictionary word substituted into the
+	/// predicted position of 'ng' (history kept fixed) and return the total as
+	/// log10. The OOV penalty is temporarily zeroed so OOV entries do not
+	/// distort the normalization check, then restored before returning.
+	/// NOTE: 'ng' is modified in place (its last word is overwritten).
+	double lmContextDependent::total_clprob(ngram& ng, topic_map_t& topic_weights)
+	{
+		VERBOSE(2,"lmContextDependent::total_lprob(ngram& ng, topic_map_t& topic_weights)" << std::endl);
+		double tot_pr = 0.0;
+		double v_pr;
+		double oovpenalty = getlogOOVpenalty();
+		setlogOOVpenalty((double) 0);
+		for (int v=0; v<dict->size(); ++v){
+			//replace the last word (position 1), keeping the history unchanged
+			*ng.wordp(1) = ng.dict->encode(dict->decode(v));
+			v_pr = clprob(ng, topic_weights);
+			tot_pr += pow(10.0,v_pr); //v_pr is a log10 prob
+		}
+		setlogOOVpenalty(oovpenalty);
+		return log10(tot_pr);
+	}
+
+	/// Derive the OOV penalty from a dictionary upper bound (DUB) via the
+	/// wrapped LM, and cache the resulting value locally.
+	/// @param dub assumed vocabulary size; must exceed the current dictionary size
+	double lmContextDependent::setlogOOVpenalty(int dub)
+	{
+		MY_ASSERT(dub > dict->size());
+		m_lm->setlogOOVpenalty(dub); //set OOV Penalty by means of DUB
+		// NOTE(review): this applies log() to a value whose getter name says it is
+		// already a log penalty, and uses natural log while scores elsewhere are
+		// log10 -- confirm against lmContainer's contract before relying on it
+		logOOVpenalty=log(m_lm->getlogOOVpenalty());
+		return logOOVpenalty;
+	}
+
+}//namespace irstlm
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
new file mode 100644
index 0000000..4d9c515
--- /dev/null
+++ b/src/lmContextDependent.h
@@ -0,0 +1,199 @@
+// $Id: lmContextDependent.h 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
+
+#ifndef MF_LMCONTEXTDEPENDENT_H
+#define MF_LMCONTEXTDEPENDENT_H
+
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include "util.h"
+#include "dictionary.h"
+#include "n_gram.h"
+#include "lmContainer.h"
+#include "context-similarity.h"
+
+namespace irstlm {
+	// Placeholder topic model: exposes the interface a future TopicModel will
+	// implement, but does no work (load is a no-op, prob is constant 1.0).
+	class PseudoTopicModel
+	{
+	public:
+		PseudoTopicModel(){};
+		~PseudoTopicModel(){};
+
+		// no-op: a real implementation would read the model from 'filename'
+		void load(const std::string &filename){
+			UNUSED(filename);
+		};
+
+		// constant probability: never changes a combined score
+		double prob(string_vec_t& text, topic_map_t& topic_weights){
+			UNUSED(text);
+			UNUSED(topic_weights);
+			return 1.0;
+		}
+	};
+}
+
+namespace irstlm {
+ /*
+ Context-dependent LM
+ Wrapper LM which combines a standard ngram-based word-based LM
+ and a bigram-based topic model
+ */
+
+#define LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN 6
+
+	/// Context-dependent LM: wraps a standard ngram word LM and a bigram-based
+	/// topic (context-similarity) model; scores are the LM log10 probability
+	/// plus a weighted similarity score.
+	class lmContextDependent: public lmContainer
+	{
+	private:
+		static const bool debug=true;
+		int order;                  // ngram order of the wrapped word LM
+		int dictionary_upperbound;  //set by user
+		double logOOVpenalty;       //penalty for OOV words (default 0)
+		bool isInverted;
+		int memmap;                 //level from which n-grams are accessed via mmap
+
+		lmContainer* m_lm;          // wrapped ngram word LM (owned, deleted in dtor)
+		bool m_isinverted;
+
+		// fix: this member was returned by getLexiconDelimiter() but never
+		// declared, which made the header fail to compile
+		std::string lexicon_delimiter;
+
+		//flag for enabling/disabling normalization of the language model
+		// if disabled, scores returned by the language model do not sum to 1.0
+		bool m_normalization;
+
+		ContextSimilarity* m_similaritymodel; //to remove when TopicModel is ready
+		double m_lm_weight; // NOTE(review): never used in lmContextDependent.cpp -- confirm before removing
+
+		double m_similaritymodel_weight; // weight of the similarity score in the combined score
+
+		float ngramcache_load_factor;
+		float dictionary_load_factor;
+
+		dictionary *dict; // dictionary of the wrapped word LM (set by load())
+
+	public:
+
+		// 'dlf' renamed from 'dlfi' to match the definition in lmContextDependent.cpp
+		lmContextDependent(float nlf=0.0, float dlf=0.0);
+		virtual ~lmContextDependent();
+
+		// read the configuration file and build the wrapped LM and topic model
+		void load(const std::string &filename,int mmap=0);
+
+		inline std::string getLexiconDelimiter() const{
+			return lexicon_delimiter;
+		}
+
+		// clprob variants simply forward to the matching lprob overload
+		virtual double clprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL){ return lprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow); };
+		virtual double clprob(string_vec_t& text, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL){ return lprob(text, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow); };
+		virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL){ return lprob(ng, topic_weights, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow); };
+		virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL,int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL){ return lprob(text, topic_weights, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow); };
+
+		// without topic weights: only the wrapped LM contributes to the score
+		virtual double lprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
+		virtual double lprob(string_vec_t& text, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
+		virtual double lprob(ngram& ng, string_vec_t& text, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
+
+		// with topic weights: LM score plus weighted context-similarity score
+		virtual double lprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
+		virtual double lprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
+		virtual double lprob(ngram& ng, string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
+
+		// normalization checks: log10 of the probability mass over the whole vocabulary
+		double total_clprob(string_vec_t& text, topic_map_t& topic_weights);
+		double total_clprob(ngram& ng, topic_map_t& topic_weights);
+
+		virtual inline int get(ngram& ng) {
+			return m_lm->get(ng);
+		}
+
+		virtual int get(ngram& ng,int n,int lev){
+			return m_lm->get(ng,n,lev);
+		}
+
+		virtual int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){
+			return m_lm->succscan(h,ng,action,lev);
+		}
+		/*
+		int maxlevel() const {
+			return maxlev;
+		};
+		*/
+
+		// NOTE(review): after load(), 'dict' aliases m_lm->getDict(); deleting it
+		// here while m_lm still holds the same pointer risks a double free --
+		// confirm ownership before calling this with a loaded model
+		virtual inline void setDict(dictionary* d) {
+			if (dict) delete dict;
+			dict=d;
+		};
+
+		virtual inline lmContainer* getWordLM() const {
+			return m_lm;
+		};
+
+		virtual inline ContextSimilarity* getContextSimilarity() const {
+			return m_similaritymodel;
+		};
+
+		virtual inline dictionary* getDict() const {
+			return dict;
+		};
+
+		//get penalty for OOV words
+		virtual inline double getlogOOVpenalty() const {
+			return logOOVpenalty;
+		}
+
+		// set OOV penalty by means of a dictionary upper bound (DUB)
+		virtual double setlogOOVpenalty(int dub);
+
+		// set OOV penalty directly
+		double inline setlogOOVpenalty(double oovp) {
+			return logOOVpenalty=oovp;
+		}
+
+		//set the inverted flag
+		inline bool is_inverted(const bool flag) {
+			return isInverted = flag;
+		}
+
+		//for an interpolation LM this variable does not make sense
+		//for compatibility, we return true if all subLM return true
+		inline bool is_inverted() {
+			return m_isinverted;
+		}
+
+		inline virtual void dictionary_incflag(const bool flag) {
+			dict->incflag(flag);
+		};
+
+		inline virtual bool is_OOV(int code) { //returns true if the word is OOV for each subLM
+			return m_lm->is_OOV(code);
+		}
+
+		// enable/disable the similarity model
+		inline void set_Active(bool value){
+			m_similaritymodel->set_Active(value);
+		}
+
+		bool is_Normalized(){
+			return m_normalization;
+		}
+		void set_Normalized(bool val){
+			m_normalization = val;
+		}
+
+	};
+}//namespace irstlm
+
+#endif
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list