[irstlm] 28/126: lots of changes in function signature; new supporting functions; code cleanup
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:41 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit 1d15fd6dfa723f5800a0f4fd944252b2e70a3e01
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Fri Jul 24 19:44:21 2015 +0200
lots of changes in function signature; new supporting functions; code cleanup
---
src/CMakeLists.txt | 3 +-
src/Makefile.am | 2 +
src/context-similarity.cpp | 142 +++++++++++++++++++++++++++++++++++++++++++++
src/context-similarity.h | 72 +++++++++++++++++++++++
src/lmContainer.h | 3 +-
src/lmContextDependent.cpp | 36 ++++++------
src/lmContextDependent.h | 9 +--
src/util.cpp | 11 ++++
src/util.h | 4 ++
9 files changed, 256 insertions(+), 26 deletions(-)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f81c525..36c3926 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -34,7 +34,8 @@ SET( LIB_IRSTLM_SRC
lmmacro.h lmmacro.cpp
lmtable.h lmtable.cpp
lmInterpolation.h lmInterpolation.cpp
- lmContextDependent.h lmContextDependent.h.cpp
+ lmContextDependent.h lmContextDependent.cpp
+ context-similarity.h context-similarity.cpp
mempool.h mempool.cpp
mfstream.h mfstream.cpp
n_gram.h n_gram.cpp
diff --git a/src/Makefile.am b/src/Makefile.am
index 1595788..63e2107 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -17,6 +17,7 @@ libirstlm_la_HEADERS = \
lmtable.h \
lmInterpolation.h \
lmContextDependent.h \
+ context-similarity.h \
mempool.h \
mfstream.h \
n_gram.h \
@@ -46,6 +47,7 @@ libirstlm_la_SOURCES = \
lmtable.cpp \
lmInterpolation.cpp \
lmContextDependent.cpp \
+ context-similarity.cpp \
mempool.cpp \
mfstream.cpp \
n_gram.cpp \
diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
new file mode 100644
index 0000000..9a7a7ca
--- /dev/null
+++ b/src/context-similarity.cpp
@@ -0,0 +1,142 @@
+// $Id: lmContextDependent.cpp 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include "lmContainer.h"
+#include "context-similarity.h"
+#include "util.h"
+
+using namespace std;
+
+// Report a fatal condition: echo `message` on stderr, then raise it to the
+// caller as a std::runtime_error carrying the same text.
+inline void error(const char* message)
+{
+  std::cerr << message;
+  std::cerr << "\n";
+  const std::runtime_error failure(message);
+  throw failure;
+}
+
+namespace irstlm {
+  // Build the similarity scorer from the language model stored in `filename`.
+  //
+  // The model object is obtained from lmContainer::CreateLanguageModel(), then
+  // loaded from the same file, and finally its dictionary is asked to generate
+  // its OOV code (genoovcode) so that create_ngram() can push the OOV word.
+  //
+  // Every pointer member is given a defined value up front: previously
+  // m_lm_topic_dict and topic_map were left indeterminate, so any later use
+  // read a garbage pointer.
+  // NOTE(review): nothing visible here populates m_lm_topic_dict, yet
+  // get_topic_scores() dereferences it - confirm where it is meant to be filled.
+  ContextSimilarity::ContextSimilarity(const std::string &filename)
+    : m_lm(NULL), m_lm_topic_dict(NULL), topic_map(NULL)
+  {
+    m_lm=lmContainer::CreateLanguageModel(filename);
+
+    m_lm->load(filename);
+
+    m_lm->getDict()->genoovcode();
+  }
+
+  // Release the language model created by the constructor.
+  //
+  // The delete used to be commented out, so every ContextSimilarity leaked its
+  // model. lmContextDependent releases its own lmContainer* the same way.
+  // m_lm_topic_dict and topic_map are left alone: no visible code allocates them.
+  ContextSimilarity::~ContextSimilarity()
+  {
+    delete m_lm;
+  }
+
+  // Combine the per-topic similarities of `text` into one log-domain score.
+  //
+  // For every (topic, weight) entry of `topic_weights` the term
+  //   log(weight) + get_topic_similarity(numerator, denominator)
+  // is computed, and the terms are accumulated with logsum().
+  //
+  // text          : words with w in the last position and its history before
+  //                 it; create_ngram() needs at least two words.
+  // topic_weights : a-priori weight of each topic.
+  // Returns SIMILARITY_LOWER_BOUND when `topic_weights` is empty, otherwise
+  // the accumulated log score.
+  //
+  // NOTE(review): log() is the natural logarithm; confirm that the values
+  // returned by clprob() use the same base before they are mixed here.
+  double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)
+  {
+    if (topic_weights.empty()){
+      //a-priori topic distribution is "empty", i.e. there is no score for any topic
+      //return the constant lower-bound score SIMILARITY_LOWER_BOUND
+      return SIMILARITY_LOWER_BOUND;
+    }
+
+    ngram base_num_ng(m_lm->getDict());
+    ngram base_den_ng(m_lm->getDict());
+    create_ngram(text, base_num_ng, base_den_ng);
+
+    // The accumulator must start from the first term, not from 0.0: in the
+    // log domain 0.0 stands for probability 1, so seeding with it silently
+    // added 1 to the sum (result was log(1 + sum) instead of log(sum)).
+    double ret_logprob = 0.0;
+    bool first_topic = true;
+    for (topic_map_t::iterator it = topic_weights.begin(); it != topic_weights.end(); ++it)
+    {
+      // each topic extends its own copy of the shared (h, w) n-grams
+      ngram num_ng = base_num_ng;
+      ngram den_ng = base_den_ng;
+      add_topic(it->first, num_ng, den_ng);
+
+      const double add_logprob = log(it->second) + get_topic_similarity(num_ng, den_ng);
+      if (first_topic){
+        ret_logprob = add_logprob;
+        first_topic = false;
+      }else{
+        ret_logprob = logsum(ret_logprob, add_logprob);
+      }
+    }
+
+    return ret_logprob;
+  }
+
+
+  // Compute the similarity of `text` with every topic listed in
+  // m_lm_topic_dict and return the result as a topic -> score map.
+  //
+  // text : words with w in the last position and its history before it;
+  //        create_ngram() needs at least two words.
+  // NOTE(review): no visible code fills m_lm_topic_dict before it is
+  // dereferenced here - confirm where it is populated.
+  topic_map_t ContextSimilarity::get_topic_scores(string_vec_t& text)
+  {
+    // n-grams shared by all topics; each topic works on its own copy
+    ngram shared_num(m_lm->getDict());
+    ngram shared_den(m_lm->getDict());
+    create_ngram(text, shared_num, shared_den);
+
+    topic_map_t scores;
+    topic_dict_t::iterator topic = m_lm_topic_dict->begin();
+    while (topic != m_lm_topic_dict->end())
+    {
+      ngram topic_num = shared_num;
+      ngram topic_den = shared_den;
+      add_topic(*topic, topic_num, topic_den);
+      scores[*topic] = get_topic_similarity(topic_num, topic_den);
+      ++topic;
+    }
+    return scores;
+  }
+
+  // Fill the numerator and denominator n-grams from the tail of `text`.
+  //
+  // text   : words with w in the last position and its history before it;
+  //          it must hold at least two words (shorter input is rejected by
+  //          the bounds-checked at() accessor).
+  // num_ng : receives the last two words of `text`, in order.
+  // den_ng : receives the dictionary's OOV word followed by the last word.
+  void ContextSimilarity::create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng)
+  {
+    const string_vec_t::size_type last = text.size()-1;
+
+    num_ng.pushw(text.at(last-1));
+    num_ng.pushw(text.at(last));
+
+    den_ng.pushw(den_ng.dict->OOV()); //or den_ng.pushc(m_lm->getDict()->getoovcode());
+    den_ng.pushw(text.at(last));
+  }
+
+  // Build the complete numerator/denominator pair for one topic: first the
+  // word part taken from `text` (see create_ngram: w is the last element, the
+  // history precedes it, at least two words are needed), then `topic` pushed
+  // onto both n-grams.
+  void ContextSimilarity::create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& num_ng, ngram& den_ng)
+  {
+    create_ngram(text, num_ng, den_ng);   // word part
+    add_topic(topic, num_ng, den_ng);     // topic part
+  }
+
+  // Push the same `topic` word onto both n-grams, numerator first.
+  void ContextSimilarity::add_topic(const std::string& topic, ngram& num_ng, ngram& den_ng)
+  {
+    num_ng.pushw(topic);   // numerator
+    den_ng.pushw(topic);   // denominator
+  }
+
+  // Similarity between `text` and a single `topic`.
+  //
+  // Fresh numerator and denominator n-grams are created on the model's
+  // dictionary, filled through create_topic_ngram(), and scored by the
+  // n-gram overload of get_topic_similarity().
+  double ContextSimilarity::get_topic_similarity(string_vec_t text, const std::string& topic)
+  {
+    ngram numerator(m_lm->getDict());
+    ngram denominator(m_lm->getDict());
+    create_topic_ngram(text, topic, numerator, denominator);
+
+    const double similarity = get_topic_similarity(numerator, denominator);
+    return similarity;
+  }
+
+  // Similarity of an already-built n-gram pair: the model's clprob() of the
+  // numerator minus its clprob() of the denominator.
+  double ContextSimilarity::get_topic_similarity(ngram& num_ng, ngram& den_ng)
+  {
+    const double num_logprob = m_lm->clprob(num_ng);
+    const double den_logprob = m_lm->clprob(den_ng);
+    return num_logprob - den_logprob;
+  }
+
+}//namespace irstlm
diff --git a/src/context-similarity.h b/src/context-similarity.h
new file mode 100644
index 0000000..16fbdf3
--- /dev/null
+++ b/src/context-similarity.h
@@ -0,0 +1,72 @@
+// $Id: lmContextDependent.h 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
+
+#ifndef MF_CONTEXTSIMILARITY_H
+#define MF_CONTEXTSIMILARITY_H
+
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <string>
+#include <math.h>
+#include <set>
+#include "util.h"
+#include "dictionary.h"
+#include "n_gram.h"
+#include "lmContainer.h"
+
+class ngram;
+
+namespace irstlm {
+
+
+ // Maps a topic name to a float value: score() reads the values as topic
+ // weights, get_topic_scores() fills them with per-topic similarity scores.
+ typedef std::map< std::string, float > topic_map_t;
+ // Set of topic names.
+ typedef std::set< std::string > topic_dict_t;
+
+ // Value returned by ContextSimilarity::score() when it is given no topic
+ // weights. Being a preprocessor macro, it is not confined to namespace irstlm.
+ #define SIMILARITY_LOWER_BOUND -10000
+ // Scores how well a piece of text matches a set of topics, using the
+ // language model loaded by the constructor. Each (text, topic) pair is scored
+ // as clprob(numerator n-gram) - clprob(denominator n-gram), see
+ // get_topic_similarity().
+ class ContextSimilarity
+ {
+ private:
+ lmContainer* m_lm; // P(topic | h' w)
+ // NOTE(review): no visible code fills this set, yet get_topic_scores()
+ // dereferences it - confirm where it is populated.
+ topic_dict_t* m_lm_topic_dict; //the dictionary of the topics seen in the language model
+ // NOTE(review): no visible member function stores a map here or reads from
+ // it - confirm whether it is still needed.
+ topic_map_t* topic_map;
+
+ // Fills num_ng with the last two words of text, and den_ng with the OOV word
+ // followed by the last word of text; text must hold at least two words.
+ void create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng);
+ // Pushes topic onto both n-grams.
+ void add_topic(const std::string& topic, ngram& num_ng, ngram& den_ng);
+ // create_ngram() followed by add_topic().
+ void create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& num_ng, ngram& den_ng);
+
+ // Similarity of text with one topic; builds the n-gram pair internally.
+ double get_topic_similarity(string_vec_t text, const std::string& topic);
+ // Similarity of an already-built pair: clprob(num_ng) - clprob(den_ng).
+ double get_topic_similarity(ngram& num_ng, ngram& den_ng);
+
+ public:
+ // Creates and loads the language model stored in filename.
+ ContextSimilarity(const std::string &filename);
+ ~ContextSimilarity();
+
+ // Returns one similarity score per topic of m_lm_topic_dict for text.
+ topic_map_t get_topic_scores(string_vec_t& text);
+
+ // Returns the logsum() over all entries of topic_weights of
+ // log(weight) + similarity(text, topic), or SIMILARITY_LOWER_BOUND when
+ // topic_weights is empty.
+ double score(string_vec_t& text, topic_map_t& topic_weights);
+ };
+}
+
+
+#endif
+
diff --git a/src/lmContainer.h b/src/lmContainer.h
index eb1fca6..5f760ff 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -42,9 +42,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
typedef enum {BINARY,TEXT,YRANIB,NONE} OUTFILE_TYPE;
namespace irstlm {
+ typedef std::map< std::string, float > topic_map_t;
-typedef std::map< std::string, float > topic_map_t;
-
class lmContainer
{
static const bool debug=true;
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
index 2639ace..5fab621 100644
--- a/src/lmContextDependent.cpp
+++ b/src/lmContextDependent.cpp
@@ -43,7 +43,7 @@ namespace irstlm {
ngramcache_load_factor = nlf;
dictionary_load_factor = dlf;
m_lm=NULL;
- m_topicmodel=NULL;
+ m_similaritymodel=NULL;
order=0;
memmap=0;
@@ -54,7 +54,7 @@ namespace irstlm {
lmContextDependent::~lmContextDependent()
{
if (m_lm) delete m_lm;
- if (m_topicmodel) delete m_topicmodel;
+ if (m_similaritymodel) delete m_similaritymodel;
}
void lmContextDependent::load(const std::string &filename,int mmap)
@@ -88,7 +88,7 @@ namespace irstlm {
m_lm_weight = (float) atof(words[0]);
//checking the language model type
- m_lm=lmContainer::CreateLanguageModel(words[1],ngramcache_load_factor,dictionary_load_factor);
+ m_lm=lmContainer::CreateLanguageModel(words[1],ngramcache_load_factor, dictionary_load_factor);
m_lm->setMaxLoadedLevel(requiredMaxlev);
@@ -108,9 +108,8 @@ namespace irstlm {
}
//loading topic model and initialization
- m_topicmodel_weight = (float) atof(words[0]);
- m_topicmodel = new PseudoTopicModel();
- m_topicmodel->load(words[1]);
+ m_similaritymodel_weight = (float) atof(words[0]);
+ m_similaritymodel = new ContextSimilarity(words[1]);
inp.close();
}
@@ -118,12 +117,12 @@ namespace irstlm {
double lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
{
string_vec_t text; // replace with the text passed as parameter
- double lm_prob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
- double topic_prob = m_topicmodel->prob(text, topic_weights);
- double ret_prob = m_lm_weight * lm_prob + m_topicmodel_weight * topic_prob;
- VERBOSE(0, "lm_prob:" << lm_prob << " m_lm_weight:" << m_lm_weight << " topic_prob:" << topic_prob << " m_topicmodel_weight:" << m_topicmodel_weight << " ret_prob:" << ret_prob << std::endl);
+ double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
+ double similarity_score = m_similaritymodel->score(text, topic_weights);
+ double ret_logprob = m_lm_weight * lm_logprob + m_similaritymodel_weight * similarity_score;
+ VERBOSE(0, "lm_logprob:" << lm_logprob << " m_lm_weight:" << m_lm_weight<< " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_logprob:" << ret_logprob << std::endl);
- return ret_prob;
+ return ret_logprob;
}
double lmContextDependent::lprob(string_vec_t& text, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
@@ -132,15 +131,13 @@ namespace irstlm {
//create the actual ngram
ngram ng(dict);
ng.pushw(text);
- MY_ASSERT (ng.size == text.size());
+ MY_ASSERT (ng.size == (int) text.size());
+ double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
+ double similarity_score = m_similaritymodel->score(text, topic_weights);
+ double ret_logprob = m_lm_weight * lm_logprob + m_similaritymodel_weight * similarity_score;
+ VERBOSE(0, "lm_logprob:" << lm_logprob << " m_lm_weight:" << m_lm_weight<< " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_logprob:" << ret_logprob << std::endl);
- double lm_prob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
- double topic_prob = m_topicmodel->prob(text, topic_weights);
-
- double ret_prob = m_lm_weight * lm_prob + m_topicmodel_weight * topic_prob;
- VERBOSE(0, "lm_prob:" << lm_prob << " m_lm_weight:" << m_lm_weight << " topic_prob:" << topic_prob << " m_topicmodel_weight:" << m_topicmodel_weight << " ret_prob:" << ret_prob << std::endl);
-
- return ret_prob;
+ return ret_logprob;
}
double lmContextDependent::lprob(int* codes, int sz, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
@@ -153,6 +150,7 @@ namespace irstlm {
return lprob(ong, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
}
+
double lmContextDependent::setlogOOVpenalty(int dub)
{
MY_ASSERT(dub > dict->size());
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
index d3d1689..efecec6 100644
--- a/src/lmContextDependent.h
+++ b/src/lmContextDependent.h
@@ -32,6 +32,7 @@
#include "dictionary.h"
#include "n_gram.h"
#include "lmContainer.h"
+#include "context-similarity.h"
namespace irstlm {
class PseudoTopicModel
@@ -72,15 +73,15 @@ namespace irstlm {
int memmap; //level from which n-grams are accessed via mmap
lmContainer* m_lm;
- std::string m_lm_file;
+// std::string m_lm_file;
bool m_isinverted;
// TopicModel* m_topicmodel;
- PseudoTopicModel* m_topicmodel; //to remove when TopicModel is ready
+ ContextSimilarity* m_similaritymodel; //to remove when TopicModel is ready
double m_lm_weight;
- double m_topicmodel_weight;
- std::string m_topicmodel_file;
+ double m_similaritymodel_weight;
+// std::string m_similaritymodel_file;
float ngramcache_load_factor;
float dictionary_load_factor;
diff --git a/src/util.cpp b/src/util.cpp
index 6db27c4..b25f3ea 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -375,3 +375,14 @@ string_vec_t &split(const std::string &s, char delim, string_vec_t &elems) {
return elems;
}
+// Log-domain addition (single precision): given a = log(x) and b = log(y),
+// return log(x + y) without leaving the log domain.
+//
+// The larger argument is factored out so that expf() only ever sees a
+// non-positive exponent, and log1pf() keeps precision when that exponential
+// is tiny.
+// Equal arguments are handled first: for a == b the result is a + log(2), and
+// this also covers a == b == -infinity (log(0) + log(0)), where the generic
+// formula would evaluate expf(inf - inf) and return NaN.
+float logsum(float a,float b){
+  if (a == b) return a + logf(2.0f);
+  const float hi = (b<a) ? a : b;
+  const float lo = (b<a) ? b : a;
+  return hi + log1pf(expf(lo-hi));
+}
+
+// Log-domain addition (double precision): given a = log(x) and b = log(y),
+// return log(x + y) without leaving the log domain.
+//
+// The larger argument is factored out so that exp() only ever sees a
+// non-positive exponent, and log1p() keeps precision when that exponential
+// is tiny.
+// Equal arguments are handled first: for a == b the result is a + log(2), and
+// this also covers a == b == -infinity (log(0) + log(0)), where the generic
+// formula would evaluate exp(inf - inf) and return NaN.
+double logsum(double a,double b){
+  if (a == b) return a + log(2.0);
+  const double hi = (b<a) ? a : b;
+  const double lo = (b<a) ? b : a;
+  return hi + log1p(exp(lo-hi));
+}
+
+
diff --git a/src/util.h b/src/util.h
index fbb9bbf..694a485 100644
--- a/src/util.h
+++ b/src/util.h
@@ -9,6 +9,7 @@
#include <iostream>
#include <fstream>
#include <assert.h>
+#include <math.h>
using namespace std;
@@ -102,5 +103,8 @@ extern const int tracelevel;
void MY_ASSERT(bool x);
+float logsum(float a,float b);
+double logsum(double a,double b);
+
#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list