[irstlm] 14/78: added functions to handle with context weights

Giulio Paci giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:01 UTC 2016


This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.

commit ed6d2bc460e08028fa5de93a1b5b1e874babb2bf
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Sun Nov 8 18:38:19 2015 +0100

    added functions to handle with context weights
---
 src/lmContainer.cpp | 42 ++++++++++++++++++++++++++++++++++++++
 src/lmContainer.h   | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 src/lmtable.h       |  1 -
 src/util.cpp        | 10 +++++++++
 src/util.h          | 13 ++++++++++--
 5 files changed, 120 insertions(+), 5 deletions(-)

diff --git a/src/lmContainer.cpp b/src/lmContainer.cpp
index bde6996..afdd77c 100644
--- a/src/lmContainer.cpp
+++ b/src/lmContainer.cpp
@@ -172,4 +172,46 @@ namespace irstlm {
 		return false;
 	};
 	
+	bool lmContainer::GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line)
+	{
+		VERBOSE(2,"bool lmContextDependent::GetSentenceAndContext" << std::endl);
+		VERBOSE(2,"line:|" << line << "|" << std::endl);
+		bool ret;
+		size_t pos = line.find(context_delimiter);
+		if (pos != std::string::npos){ // context_delimiter is found
+			sentence = line.substr(0, pos);
+			line.erase(0, pos + context_delimiter.length());
+			
+			//getting context string;
+			context = line;
+			ret=true;
+		}else{
+			sentence = line;
+			context = "";
+			ret=false;
+		}
+		VERBOSE(2,"sentence:|" << sentence << "|" << std::endl);
+		VERBOSE(2,"context:|" << context << "|" << std::endl);
+		return ret;
+	}
+	
+	void lmContainer::setContextMap(topic_map_t& topic_map, const std::string& context){
+		
+		string_vec_t topic_weight_vec;
+		string_vec_t topic_weight;
+		
+		// context is expected to be in this format:
+		// topic-name1,topic-value1:topic-name2,topic-value2:...:topic-nameN,topic-valueN
+		
+		//first-level split the context in a vector of  topic-name1,topic-value1, using the first separator ':'
+		split(context, topic_map_delimiter1, topic_weight_vec);
+		for (string_vec_t::iterator it=topic_weight_vec.begin(); it!=topic_weight_vec.end(); ++it){
+			//second-level split of each entry into topic-name and topic-value, using the second separator ','
+			split(*it, topic_map_delimiter2, topic_weight);
+			topic_map[topic_weight.at(0)] = strtod (topic_weight.at(1).c_str(), NULL);
+			topic_weight.clear();
+		}
+	}
+	
+	
 }//namespace irstlm
diff --git a/src/lmContainer.h b/src/lmContainer.h
index 8881617..b40be41 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -28,7 +28,7 @@
 #define _IRSTLM_LMMACRO 2
 #define _IRSTLM_LMCLASS 3
 #define _IRSTLM_LMINTERPOLATION 4
-
+#define _IRSTLM_LMCONTEXTDEPENDENT 5
 
 #include <stdio.h>
 #include <cstdlib>
@@ -46,6 +46,16 @@ typedef enum {LMT_FIND,    //!< search: find an entry
 } LMT_ACTION;
 
 namespace irstlm {
+	static const std::string context_delimiter="___CONTEXT___";
+	static const std::string lexicon_delimiter="___LEXICON___";
+  static const char topic_map_delimiter1=':';
+	static const char topic_map_delimiter2=',';
+//  #define topic_map_delimiter1 ':'
+//	#define topic_map_delimiter2 ','
+	
+	
+	typedef std::map< std::string, float > topic_map_t;
+	
 	class lmContainer
 	{
 		static const bool debug=true;
@@ -57,6 +67,10 @@ namespace irstlm {
 		int          maxlev; //maximun order of sub LMs;
 		int  requiredMaxlev; //max loaded level, i.e. load up to requiredMaxlev levels
 		
+		bool m_isadaptive; //flag is true if the LM can be adapted by means of any external context
+		void isAdaptive(bool val){ m_isadaptive = val; }
+		bool isAdaptive(){ return m_isadaptive;}
+		
 	public:
 		
 		lmContainer();
@@ -127,7 +141,9 @@ namespace irstlm {
 		virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, NULL, NULL, NULL); }
 		virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, NULL, NULL); }
 		virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, NULL); };
-		virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow); }
+		
+//		virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow){return 0.0;};
+		virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow);
 		
 		virtual double clprob(int* ng, int ngsize=NULL, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
 		{
@@ -139,6 +155,39 @@ namespace irstlm {
 			return clprob(ong, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
 		};
 		
+		virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL) { return clprob(ng, topic_weights, bow, bol, NULL, maxsuffptr, statesize, extendible, lastbow); };		
+		virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+		{
+			UNUSED(topic_weights);
+			return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+		}
+		virtual double clprob(int* ng, int ngsize, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+		{
+			//create the actual ngram
+			ngram ong(getDict());
+			ong.pushc(ng,ngsize);
+			MY_ASSERT (ong.size == ngsize);
+			
+			return clprob(ong, topic_weights, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+		}
+		virtual double clprob(string_vec_t& text, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+		{
+			UNUSED(text);
+			UNUSED(bow);
+			UNUSED(bol);
+			UNUSED(maxsuffidx);
+			UNUSED(maxsuffptr);
+			UNUSED(statesize);
+			UNUSED(extendible);
+			UNUSED(lastbow);
+			return 0.0;
+		};
+		virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL)
+		{
+			UNUSED(topic_weights);
+			return clprob(text, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+		}
+		
 		virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL)
 		{
 			UNUSED(ng);
@@ -249,6 +298,12 @@ namespace irstlm {
 			VERBOSE(3,"virtual void lmContainer::print_table_stat() "<< std::endl);
 		};
 		
+		inline std::string getContextDelimiter() const{ return context_delimiter; }
+		
+		bool GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line);
+		
+		void setContextMap(topic_map_t& topic_map, const std::string& context);
+		
 	};
 	
 }//namespace irstlm
diff --git a/src/lmtable.h b/src/lmtable.h
index e7cba2d..606a76d 100644
--- a/src/lmtable.h
+++ b/src/lmtable.h
@@ -329,7 +329,6 @@ namespace irstlm {
 		*/
 
 		virtual double  lprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
-		
 		virtual double clprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
 
 		virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
diff --git a/src/util.cpp b/src/util.cpp
index 77b8972..24220bf 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -364,6 +364,16 @@ namespace irstlm {
 			return NULL;
 		}
 	}
+
+	string_vec_t &split(const std::string &s, const char delim, string_vec_t &elems) {
+		std::stringstream ss(s);
+		std::string item;
+		while (std::getline(ss, item, delim)) {
+			elems.push_back(item);
+		}
+		return elems;
+	}
+	
 	
 }
 
diff --git a/src/util.h b/src/util.h
index 3db4416..23609d8 100644
--- a/src/util.h
+++ b/src/util.h
@@ -3,10 +3,11 @@
 #ifndef IRSTLM_UTIL_H
 #define IRSTLM_UTIL_H
 
-
 #include <string>
 #include <iostream>
 #include <fstream>
+#include <vector>
+#include <map>
 #include <assert.h>
 
 using namespace std;
@@ -49,9 +50,14 @@ using namespace std;
 #define BUCKET 10000
 #define SSEED 50
 
-class ngram;
+typedef std::vector< std::string > string_vec_t;
+typedef std::vector< double > double_vec_t;
+typedef std::vector< float > float_vec_t;
+typedef std::map< std::string, float > topic_map_t;
+
 typedef unsigned int  ngram_state_t; //type for pointing to a full ngram in the table
 
+class ngram;
 class mfstream;
 
 std::string gettempfolder();
@@ -74,9 +80,12 @@ void ShowProgress(long long current,long long total);
 int parseWords(char *, const char **, int);
 int parseline(istream& inp, int Order,ngram& ng,float& prob,float& bow);
 
+//split a string into a vector of string according to one specified delimiter (char)
+
 void exit_error(int err, const std::string &msg="");
 
 namespace irstlm{
+  string_vec_t &split(const std::string &s, const char delim, string_vec_t &elems);
 	void* reallocf(void *ptr, size_t size);
 }
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git



More information about the debian-science-commits mailing list