[irstlm] 28/126: lots of changes in function signature; new supporting functions; code cleanup

Tue May 17 07:46:41 UTC 2016

This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.

commit 1d15fd6dfa723f5800a0f4fd944252b2e70a3e01
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Fri Jul 24 19:44:21 2015 +0200

    lots of changes in function signature; new supporting functions; code cleanup
---
 src/CMakeLists.txt         |   3 +-
 src/Makefile.am            |   2 +
 src/context-similarity.cpp | 142 +++++++++++++++++++++++++++++++++++++++++++++
 src/context-similarity.h   |  72 +++++++++++++++++++++++
 src/lmContainer.h          |   3 +-
 src/lmContextDependent.cpp |  36 ++++++------
 src/lmContextDependent.h   |   9 +--
 src/util.cpp               |  11 ++++
 src/util.h                 |   4 ++
 9 files changed, 256 insertions(+), 26 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f81c525..36c3926 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -34,7 +34,8 @@ SET( LIB_IRSTLM_SRC
         lmmacro.h lmmacro.cpp
         lmtable.h lmtable.cpp
         lmInterpolation.h lmInterpolation.cpp
-        lmContextDependent.h lmContextDependent.h.cpp
+        lmContextDependent.h lmContextDependent.cpp
+        context-similarity.h context-similarity.cpp
         mempool.h mempool.cpp 
         mfstream.h mfstream.cpp 
         n_gram.h n_gram.cpp 
diff --git a/src/Makefile.am b/src/Makefile.am
index 1595788..63e2107 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -17,6 +17,7 @@ libirstlm_la_HEADERS = \
   lmtable.h \
   lmInterpolation.h \
   lmContextDependent.h \
+  context-similarity.h \
   mempool.h \
   mfstream.h \
   n_gram.h \
@@ -46,6 +47,7 @@ libirstlm_la_SOURCES = \
   lmtable.cpp \
   lmInterpolation.cpp \
   lmContextDependent.cpp \
+  context-similarity.cpp \
   mempool.cpp \
   mfstream.cpp \
   n_gram.cpp \
diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
new file mode 100644
index 0000000..9a7a7ca
--- /dev/null
+++ b/src/context-similarity.cpp
@@ -0,0 +1,142 @@
+// $Id: lmContextDependent.cpp 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ 
+ ******************************************************************************/
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include "lmContainer.h"
+#include "context-similarity.h"
+#include "util.h"
+
+using namespace std;
+
+inline void error(const char* message)	//writes message to stderr, then throws it as a std::runtime_error
+{
+  std::cerr << message << "\n";
+  throw std::runtime_error(message);	//never returns normally
+}
+
+namespace irstlm {
+	ContextSimilarity::ContextSimilarity(const std::string &filename)
+	{
+		m_lm=lmContainer::CreateLanguageModel(filename);
+		//filename is handed to both CreateLanguageModel() and load(); NOTE(review): the factory result is dereferenced without a NULL check
+		m_lm->load(filename);
+		//NOTE(review): the members m_lm_topic_dict and topic_map are not assigned anywhere in this constructor
+		m_lm->getDict()->genoovcode();
+	}
+	
+	ContextSimilarity::~ContextSimilarity()
+	{
+		delete m_lm;	//release the model obtained in the constructor (deleting NULL is a no-op); the class has no copy control, so do not copy instances
+	}
+	
+	double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)
+	{
+		if (topic_weights.size() == 0){
+			//a-priori topic distribution is "empty", i.e. there is no score for any topic
+			//return a "constant" lower-bound score, SIMILARITY_LOWER_BOUND, which stands in for log(0.0)
+			return SIMILARITY_LOWER_BOUND;
+		}
+		//text must hold at least two words (see create_ngram); the topic-independent n-grams are built once and copied per topic
+		ngram base_num_ng(m_lm->getDict());
+		ngram base_den_ng(m_lm->getDict());
+		create_ngram(text, base_num_ng, base_den_ng);
+		//result: log-domain sum, over all entries of topic_weights, of log(weight) + get_topic_similarity()
+		double ret_logprob = 0.0;	//placeholder only: overwritten by the first topic below
+		double add_logprob;
+		topic_map_t::iterator it = topic_weights.begin();
+		do
+		{
+			ngram num_ng = base_num_ng;
+			ngram den_ng = base_den_ng;
+			add_topic(it->first, num_ng, den_ng);
+			add_logprob = log(it->second) + get_topic_similarity(num_ng, den_ng);
+			//seed with the first term: passing the initial 0.0 (= log 1) to logsum would add 1 to the summed quantity
+			ret_logprob = (it == topic_weights.begin()) ? add_logprob : logsum(ret_logprob, add_logprob);
+			++it;
+		}while (it!= topic_weights.end());
+		return ret_logprob;
+	}
+	
+	
+	topic_map_t ContextSimilarity::get_topic_scores(string_vec_t& text)
+	{
+		topic_map_t topic_map;	//local result; hides the class member of the same name
+		//topic-independent part of numerator and denominator, built once and copied for every topic
+		ngram base_num_ng(m_lm->getDict());
+		ngram base_den_ng(m_lm->getDict());
+		create_ngram(text, base_num_ng, base_den_ng);
+		//NOTE(review): m_lm_topic_dict is not set by the constructor in context-similarity.cpp; confirm it is initialised before this loop dereferences it
+		for (topic_dict_t::iterator it=m_lm_topic_dict->begin(); it != m_lm_topic_dict->end(); ++it)
+		{
+			ngram num_ng = base_num_ng;
+			ngram den_ng = base_den_ng;
+			add_topic(*it, num_ng, den_ng);
+			topic_map[*it] = get_topic_similarity(num_ng, den_ng);	//double result is stored as float, the value type of topic_map_t
+		}
+		return topic_map;
+	}
+	
+	void ContextSimilarity::create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng)
+	{
+		//text is a vector of strings with w in the last position and the history in the previous positions
+		//text must have at least two words; with fewer, size()-2 wraps around and at() throws (std::out_of_range, assuming string_vec_t is a std::vector)
+		num_ng.pushw(text.at(text.size()-2));
+		num_ng.pushw(text.at(text.size()-1));
+		//the denominator gets the dictionary's OOV() entry where the numerator has the last history word, then the same w
+		den_ng.pushw(den_ng.dict->OOV());		//or den_ng.pushc(m_lm->getDict()->getoovcode());
+		den_ng.pushw(text.at(text.size()-1));
+	}
+	
+	void ContextSimilarity::create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& num_ng, ngram& den_ng)
+	{
+		//text is a vector of strings with w in the last position and the history in the previous positions
+		//text must have at least two words
+		create_ngram(text, num_ng, den_ng);	//word part first
+		add_topic(topic, num_ng, den_ng);	//then the topic is pushed onto both n-grams
+	}
+	
+	void ContextSimilarity::add_topic(const std::string& topic, ngram& num_ng, ngram& den_ng)
+	{		//pushes the same topic string onto both the numerator and the denominator n-gram
+		num_ng.pushw(topic);
+		den_ng.pushw(topic);
+	}
+	
+	double ContextSimilarity::get_topic_similarity(string_vec_t text, const std::string& topic)
+	{
+		ngram num_ng(m_lm->getDict());
+		ngram den_ng(m_lm->getDict());
+		//NOTE(review): text is taken by value here, although create_topic_ngram() only needs a const reference
+		create_topic_ngram(text, topic, num_ng, den_ng);
+		//the actual computation is done by the overload taking the two n-grams
+		return get_topic_similarity(num_ng, den_ng);
+	}
+	
+	double ContextSimilarity::get_topic_similarity(ngram& num_ng, ngram& den_ng)
+	{
+		return m_lm->clprob(num_ng) - m_lm->clprob(den_ng);	//difference of the two clprob() values; presumably log-probabilities, making this a log-ratio (confirm)
+	}
+	
+}//namespace irstlm
diff --git a/src/context-similarity.h b/src/context-similarity.h
new file mode 100644
index 0000000..16fbdf3
--- /dev/null
+++ b/src/context-similarity.h
@@ -0,0 +1,72 @@
+// $Id: lmContextDependent.h 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ 
+ ******************************************************************************/
+
+#ifndef MF_CONTEXTSIMILARITY_H
+#define MF_CONTEXTSIMILARITY_H
+
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <string>
+#include <math.h>
+#include <set>
+#include "util.h"
+#include "dictionary.h"
+#include "n_gram.h"
+#include "lmContainer.h"
+
+class ngram;
+
+namespace irstlm {
+	
+	
+	typedef std::map< std::string, float > topic_map_t;
+	typedef std::set< std::string > topic_dict_t;
+	
+	#define SIMILARITY_LOWER_BOUND -10000
+	class ContextSimilarity
+	{
+	private:
+		lmContainer* m_lm; // P(topic | h' w)
+		topic_dict_t* m_lm_topic_dict; //the dictionary of the topics seen in the language model
+		topic_map_t* topic_map; //NOTE(review): get_topic_scores() declares a local variable with this same name
+		//helpers that fill the numerator (num_ng) and denominator (den_ng) n-grams from the text and/or a topic
+		void create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng);
+		void add_topic(const std::string& topic, ngram& num_ng, ngram& den_ng);
+		void create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& num_ng, ngram& den_ng);
+		//similarity for one topic, either from the raw text or from already-built n-grams
+		double get_topic_similarity(string_vec_t text, const std::string& topic);
+		double get_topic_similarity(ngram& num_ng, ngram& den_ng);
+		
+	public:
+		ContextSimilarity(const std::string &filename);	//filename is handed to lmContainer::CreateLanguageModel() and to load()
+		~ContextSimilarity();
+
+		topic_map_t get_topic_scores(string_vec_t& text);	//one score per entry of m_lm_topic_dict
+		
+		double score(string_vec_t& text, topic_map_t& topic_weights);	//returns SIMILARITY_LOWER_BOUND when topic_weights is empty
+	};
+}
+
+
+#endif
+
diff --git a/src/lmContainer.h b/src/lmContainer.h
index eb1fca6..5f760ff 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -42,9 +42,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
 typedef enum {BINARY,TEXT,YRANIB,NONE} OUTFILE_TYPE;
 
 namespace irstlm {
+	typedef std::map< std::string, float > topic_map_t;
 	
-typedef std::map< std::string, float > topic_map_t;
-
 class lmContainer
 {
   static const bool debug=true;
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
index 2639ace..5fab621 100644
--- a/src/lmContextDependent.cpp
+++ b/src/lmContextDependent.cpp
@@ -43,7 +43,7 @@ namespace irstlm {
 		ngramcache_load_factor = nlf;
 		dictionary_load_factor = dlf;
 		m_lm=NULL;
-		m_topicmodel=NULL;
+		m_similaritymodel=NULL;
 		
 		order=0;
 		memmap=0;
@@ -54,7 +54,7 @@ namespace irstlm {
 	lmContextDependent::~lmContextDependent()
 	{
 		if (m_lm) delete m_lm;
-		if (m_topicmodel) delete m_topicmodel;
+		if (m_similaritymodel) delete m_similaritymodel;
 	}
 	
 	void lmContextDependent::load(const std::string &filename,int mmap)
@@ -88,7 +88,7 @@ namespace irstlm {
 		m_lm_weight = (float) atof(words[0]);
 		
 		//checking the language model type
-		m_lm=lmContainer::CreateLanguageModel(words[1],ngramcache_load_factor,dictionary_load_factor);
+		m_lm=lmContainer::CreateLanguageModel(words[1],ngramcache_load_factor, dictionary_load_factor);
 		
 		m_lm->setMaxLoadedLevel(requiredMaxlev);
 		
@@ -108,9 +108,8 @@ namespace irstlm {
 		}
 		
 		//loading topic model and initialization
-		m_topicmodel_weight = (float) atof(words[0]);
-		m_topicmodel = new  PseudoTopicModel();
-		m_topicmodel->load(words[1]);
+		m_similaritymodel_weight = (float) atof(words[0]);
+		m_similaritymodel = new ContextSimilarity(words[1]);
 		
 		inp.close();
 	}
@@ -118,12 +117,12 @@ namespace irstlm {
 	double lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
 	{
 		string_vec_t text;   // replace with the text passed as parameter
-		double lm_prob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
-		double topic_prob = m_topicmodel->prob(text, topic_weights);
-		double ret_prob = m_lm_weight * lm_prob + m_topicmodel_weight * topic_prob;
-		VERBOSE(0, "lm_prob:" << lm_prob << " m_lm_weight:" << m_lm_weight << " topic_prob:" << topic_prob << " m_topicmodel_weight:" << m_topicmodel_weight << " ret_prob:" << ret_prob << std::endl);
+		double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
+		double similarity_score = m_similaritymodel->score(text, topic_weights);
+		double ret_logprob = m_lm_weight * lm_logprob + m_similaritymodel_weight * similarity_score;
+		VERBOSE(0, "lm_logprob:" << lm_logprob << " m_lm_weight:" << m_lm_weight<< " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_logprob:" << ret_logprob << std::endl);
 		
-		return ret_prob;
+		return ret_logprob;
 	}
 	
 	double lmContextDependent::lprob(string_vec_t& text, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
@@ -132,15 +131,13 @@ namespace irstlm {
 		//create the actual ngram
 		ngram ng(dict);
 		ng.pushw(text);
-		MY_ASSERT (ng.size == text.size());
+		MY_ASSERT (ng.size == (int) text.size());
+		double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
+		double similarity_score = m_similaritymodel->score(text, topic_weights);
+		double ret_logprob = m_lm_weight * lm_logprob + m_similaritymodel_weight * similarity_score;
+		VERBOSE(0, "lm_logprob:" << lm_logprob << " m_lm_weight:" << m_lm_weight<< " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_logprob:" << ret_logprob << std::endl);
 		
-		double lm_prob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
-		double topic_prob = m_topicmodel->prob(text, topic_weights);
-		
-		double ret_prob = m_lm_weight * lm_prob + m_topicmodel_weight * topic_prob;
-		VERBOSE(0, "lm_prob:" << lm_prob << " m_lm_weight:" << m_lm_weight << " topic_prob:" << topic_prob << " m_topicmodel_weight:" << m_topicmodel_weight << " ret_prob:" << ret_prob << std::endl);
-		
-		return ret_prob;	
+		return ret_logprob;
 	}
 	
 	double lmContextDependent::lprob(int* codes, int sz, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
@@ -153,6 +150,7 @@ namespace irstlm {
 		
 		return lprob(ong, topic_weights, bow, bol, maxsuffptr, statesize, extendible);	
 	}
+	
 	double lmContextDependent::setlogOOVpenalty(int dub)
 	{
 		MY_ASSERT(dub > dict->size());
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
index d3d1689..efecec6 100644
--- a/src/lmContextDependent.h
+++ b/src/lmContextDependent.h
@@ -32,6 +32,7 @@
 #include "dictionary.h"
 #include "n_gram.h"
 #include "lmContainer.h"
+#include "context-similarity.h"
 
 namespace irstlm {
 	class PseudoTopicModel
@@ -72,15 +73,15 @@ namespace irstlm {
 		int memmap;  //level from which n-grams are accessed via mmap
 		
 		lmContainer* m_lm;
-		std::string m_lm_file;
+//		std::string m_lm_file;
 		bool m_isinverted;
 		
 		//  TopicModel* m_topicmodel;
-		PseudoTopicModel* m_topicmodel;   //to remove when TopicModel is ready
+		ContextSimilarity* m_similaritymodel;   //to remove when TopicModel is ready
 		double m_lm_weight;
 		
-		double m_topicmodel_weight;
-		std::string m_topicmodel_file;
+		double m_similaritymodel_weight;
+//		std::string m_similaritymodel_file;
 		
 		float ngramcache_load_factor;
 		float dictionary_load_factor;
diff --git a/src/util.cpp b/src/util.cpp
index 6db27c4..b25f3ea 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -375,3 +375,14 @@ string_vec_t &split(const std::string &s, char delim, string_vec_t &elems) {
 	return elems;
 }
 
+float logsum(float a,float b){	//returns log(exp(a)+exp(b)) in single precision
+	if (b<a) return a + logf(1 + expf(b-a));	//the larger argument is factored out, so expf() only receives an exponent <= 0
+	else return b + logf(1+ expf(a-b));
+}
+
+double logsum(double a,double b){	//returns log(exp(a)+exp(b)) in double precision
+	if (b<a) return a + log(1 + exp(b-a));	//the larger argument is factored out, so exp() only receives an exponent <= 0
+	else return b + log(1+ exp(a-b));
+}
+
+
diff --git a/src/util.h b/src/util.h
index fbb9bbf..694a485 100644
--- a/src/util.h
+++ b/src/util.h
@@ -9,6 +9,7 @@
 #include <iostream>
 #include <fstream>
 #include <assert.h>
+#include <math.h>
 
 using namespace std;
 
@@ -102,5 +103,8 @@ extern const int tracelevel;
 
 void MY_ASSERT(bool x);
 
+float logsum(float a,float b);
+double logsum(double a,double b);
+
 #endif
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git