[irstlm] 14/78: added functions to handle with context weights
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:01 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.
commit ed6d2bc460e08028fa5de93a1b5b1e874babb2bf
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Sun Nov 8 18:38:19 2015 +0100
added functions to handle with context weights
---
src/lmContainer.cpp | 42 ++++++++++++++++++++++++++++++++++++++
src/lmContainer.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++--
src/lmtable.h | 1 -
src/util.cpp | 10 +++++++++
src/util.h | 13 ++++++++++--
5 files changed, 120 insertions(+), 5 deletions(-)
diff --git a/src/lmContainer.cpp b/src/lmContainer.cpp
index bde6996..afdd77c 100644
--- a/src/lmContainer.cpp
+++ b/src/lmContainer.cpp
@@ -172,4 +172,46 @@ namespace irstlm {
return false;
};
+ /// Splits an input line into its sentence part and its context part.
+ ///
+ /// The two parts are separated by the first occurrence of context_delimiter.
+ /// @param sentence [out] text preceding the delimiter, or the whole line if there is none
+ /// @param context  [out] text following the delimiter, or "" if there is none
+ /// @param line     [in,out] input line; when the delimiter is found, the sentence and the
+ ///                 delimiter are erased from it, so on return it holds only the context
+ /// @return true if context_delimiter occurs in line, false otherwise
+ bool lmContainer::GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line)
+ {
+   VERBOSE(2,"bool lmContainer::GetSentenceAndContext" << std::endl);
+   VERBOSE(2,"line:|" << line << "|" << std::endl);
+
+   const size_t pos = line.find(context_delimiter);
+   const bool found = (pos != std::string::npos);
+   if (found){
+     sentence = line.substr(0, pos);
+     // drop "<sentence><delimiter>" so that line is left holding only the context part
+     line.erase(0, pos + context_delimiter.length());
+     context = line;
+   }else{
+     sentence = line;
+     context = "";
+   }
+
+   VERBOSE(2,"sentence:|" << sentence << "|" << std::endl);
+   VERBOSE(2,"context:|" << context << "|" << std::endl);
+   return found;
+ }
+
+ /// Parses a context string into a topic-name -> weight map.
+ ///
+ /// context is expected in this format:
+ ///   topic-name1,topic-value1:topic-name2,topic-value2:...:topic-nameN,topic-valueN
+ /// Each parsed pair is stored in topic_map (overwriting an existing entry with the same
+ /// name); topic_map is not cleared first. Items that do not contain both a name and a
+ /// value (e.g. an empty item between two ':' or an item without ",value") are skipped
+ /// instead of raising std::out_of_range.
+ /// @param topic_map [in,out] map receiving one weight per topic name
+ /// @param context   context string to parse
+ void lmContainer::setContextMap(topic_map_t& topic_map, const std::string& context){
+
+   string_vec_t topic_weight_vec;
+   string_vec_t topic_weight;
+
+   //first-level split: break the context into "topic-name,topic-value" items, using the first separator ':'
+   split(context, topic_map_delimiter1, topic_weight_vec);
+   for (string_vec_t::const_iterator it=topic_weight_vec.begin(); it!=topic_weight_vec.end(); ++it){
+     //second-level split: break one item into topic-name and topic-value, using the second separator ','
+     topic_weight.clear(); // split() appends, so drop the fields of the previous item
+     split(*it, topic_map_delimiter2, topic_weight);
+     if (topic_weight.size() < 2){
+       // NOTE(review): verbosity level 1 is an assumption; adjust to the project's convention
+       VERBOSE(1,"lmContainer::setContextMap: skipping malformed topic entry:|" << *it << "|" << std::endl);
+       continue;
+     }
+     topic_map[topic_weight[0]] = strtod (topic_weight[1].c_str(), NULL);
+   }
+ }
+
+
}//namespace irstlm
diff --git a/src/lmContainer.h b/src/lmContainer.h
index 8881617..b40be41 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -28,7 +28,7 @@
#define _IRSTLM_LMMACRO 2
#define _IRSTLM_LMCLASS 3
#define _IRSTLM_LMINTERPOLATION 4
-
+#define _IRSTLM_LMCONTEXTDEPENDENT 5
#include <stdio.h>
#include <cstdlib>
@@ -46,6 +46,16 @@ typedef enum {LMT_FIND, //!< search: find an entry
} LMT_ACTION;
namespace irstlm {
+ // marker separating the sentence from its context inside an input line
+ static const std::string context_delimiter="___CONTEXT___";
+ // marker for a lexicon section; NOTE(review): not used in the code shown here, confirm its role
+ static const std::string lexicon_delimiter="___LEXICON___";
+ // separates the "topic-name,topic-value" items of a context string
+ static const char topic_map_delimiter1=':';
+ // separates the topic name from its value inside one item
+ static const char topic_map_delimiter2=',';
+// #define topic_map_delimiter1 ':'
+// #define topic_map_delimiter2 ','
+
+
+ // maps a topic name to its weight
+ typedef std::map< std::string, float > topic_map_t;
+
class lmContainer
{
static const bool debug=true;
@@ -57,6 +67,10 @@ namespace irstlm {
int maxlev; //maximun order of sub LMs;
int requiredMaxlev; //max loaded level, i.e. load up to requiredMaxlev levels
+ bool m_isadaptive; //flag is true if the LM can be adapted by means of any external context
+ // setter: records whether this LM can be adapted by an external context
+ void isAdaptive(bool val){ m_isadaptive = val; }
+ // getter: returns the flag stored by isAdaptive(bool)
+ bool isAdaptive(){ return m_isadaptive;}
+
public:
lmContainer();
@@ -127,7 +141,9 @@ namespace irstlm {
virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, NULL, NULL, NULL); }
virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, NULL, NULL); }
virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, NULL); };
- virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow); }
+
+// virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow){return 0.0;};
+ virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow);
virtual double clprob(int* ng, int ngsize=NULL, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
{
@@ -139,6 +155,39 @@ namespace irstlm {
return clprob(ong, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
};
+ // Context-aware clprob variant without a maxsuffidx argument: forwards to the overload that
+ // takes maxsuffidx, passing NULL for it.
+ // NOTE(review): this overload and the next one both default every argument after topic_weights,
+ // so a call such as clprob(ng, topic_weights) appears to be ambiguous; confirm that callers
+ // always pass enough typed arguments to select one of them.
+ virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL) { return clprob(ng, topic_weights, bow, bol, NULL, maxsuffptr, statesize, extendible, lastbow); };
+ // Context-aware clprob on an ngram. This base implementation ignores topic_weights and
+ // forwards all remaining arguments to the context-free clprob(ngram, ...) overload.
+ virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+ {
+ UNUSED(topic_weights);
+ return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+ }
+ // Context-aware clprob on an array of ngsize word codes: builds an ngram over getDict(),
+ // pushes the codes into it with pushc, then forwards to the clprob(ngram, topic_map_t&, ...) overload.
+ virtual double clprob(int* ng, int ngsize, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+ {
+ //create the actual ngram
+ ngram ong(getDict());
+ ong.pushc(ng,ngsize);
+ // sanity check: the resulting ngram must have exactly ngsize entries
+ MY_ASSERT (ong.size == ngsize);
+
+ return clprob(ong, topic_weights, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+ }
+ // clprob on a vector of strings. This base implementation ignores every argument and
+ // returns 0.0; presumably subclasses that support text input override it (verify).
+ virtual double clprob(string_vec_t& text, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+ {
+ UNUSED(text);
+ UNUSED(bow);
+ UNUSED(bol);
+ UNUSED(maxsuffidx);
+ UNUSED(maxsuffptr);
+ UNUSED(statesize);
+ UNUSED(extendible);
+ UNUSED(lastbow);
+ return 0.0;
+ };
+ // Context-aware clprob on a vector of strings. This base implementation ignores
+ // topic_weights and forwards to the context-free clprob(string_vec_t&, ...) overload.
+ virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL)
+ {
+ UNUSED(topic_weights);
+ return clprob(text, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+ }
+
virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL)
{
UNUSED(ng);
@@ -249,6 +298,12 @@ namespace irstlm {
VERBOSE(3,"virtual void lmContainer::print_table_stat() "<< std::endl);
};
+ // returns a copy of the marker that separates sentence and context in an input line
+ inline std::string getContextDelimiter() const{ return context_delimiter; }
+
+ // splits line at the first context_delimiter into sentence and context;
+ // returns true if the delimiter was found (line is modified in that case)
+ bool GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line);
+
+ // fills topic_map from a context string formatted as
+ // topic-name1,topic-value1:topic-name2,topic-value2:...
+ void setContextMap(topic_map_t& topic_map, const std::string& context);
+
};
}//namespace irstlm
diff --git a/src/lmtable.h b/src/lmtable.h
index e7cba2d..606a76d 100644
--- a/src/lmtable.h
+++ b/src/lmtable.h
@@ -329,7 +329,6 @@ namespace irstlm {
*/
virtual double lprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
-
virtual double clprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
diff --git a/src/util.cpp b/src/util.cpp
index 77b8972..24220bf 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -364,6 +364,16 @@ namespace irstlm {
return NULL;
}
}
+
+ // Cuts s at every occurrence of delim and appends the resulting pieces to elems.
+ // elems is not cleared beforehand, so pieces accumulate across calls.
+ // Returns a reference to elems to allow chaining.
+ string_vec_t &split(const std::string &s, const char delim, string_vec_t &elems) {
+   std::stringstream stream(s);
+   for (std::string token; std::getline(stream, token, delim); ) {
+     elems.push_back(token);
+   }
+   return elems;
+ }
+
}
diff --git a/src/util.h b/src/util.h
index 3db4416..23609d8 100644
--- a/src/util.h
+++ b/src/util.h
@@ -3,10 +3,11 @@
#ifndef IRSTLM_UTIL_H
#define IRSTLM_UTIL_H
-
#include <string>
#include <iostream>
#include <fstream>
+#include <vector>
+#include <map>
#include <assert.h>
using namespace std;
@@ -49,9 +50,14 @@ using namespace std;
#define BUCKET 10000
#define SSEED 50
-class ngram;
+// convenience aliases for the containers used across the library
+typedef std::vector< std::string > string_vec_t; // list of strings
+typedef std::vector< double > double_vec_t; // list of doubles
+typedef std::vector< float > float_vec_t; // list of floats
+typedef std::map< std::string, float > topic_map_t; // string key -> float value
+
typedef unsigned int ngram_state_t; //type for pointing to a full ngram in the table
+class ngram;
class mfstream;
std::string gettempfolder();
@@ -74,9 +80,12 @@ void ShowProgress(long long current,long long total);
int parseWords(char *, const char **, int);
int parseline(istream& inp, int Order,ngram& ng,float& prob,float& bow);
+//split a string into a vector of strings according to one specified delimiter (char); see irstlm::split() declared below
+
void exit_error(int err, const std::string &msg="");
namespace irstlm{
+ // splits s at each delim character, appends the pieces to elems and returns elems
+ string_vec_t &split(const std::string &s, const char delim, string_vec_t &elems);
void* reallocf(void *ptr, size_t size);
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list