[irstlm] 124/126: enabled the possibility of changing (via input and per each sentence) the lexicon used for computing the approximated-Perplexity
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:52 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit 63f49f213a72dfbc578b27215d5e4c19eb0df86d
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Wed Oct 28 09:28:59 2015 +0100
enabled the possibility of changing (via input and per each sentence) the lexicon used for computing the approximated-Perplexity
---
src/context-dependent-evaluation.cpp | 66 ++++++++++++++++++++++++++++++------
src/lmContainer.h | 8 +++--
src/lmContextDependent.cpp | 24 +++++++++++++
src/lmContextDependent.h | 11 +++---
4 files changed, 91 insertions(+), 18 deletions(-)
diff --git a/src/context-dependent-evaluation.cpp b/src/context-dependent-evaluation.cpp
index 5a60515..23b815a 100644
--- a/src/context-dependent-evaluation.cpp
+++ b/src/context-dependent-evaluation.cpp
@@ -71,8 +71,7 @@ void print_help(int TypeFlag=0){
FullPrintParams(TypeFlag, 0, 1, stderr);
}
-void usage(const char *msg = 0)
-{
+void usage(const char *msg = 0){
if (msg) {
std::cerr << msg << std::endl;
}
@@ -81,6 +80,16 @@ void usage(const char *msg = 0)
}
}
+void load_lexicon(const char* lexfile, std::multimap< std::string, std::string >& lexicon){
+ if (lexfile!= NULL) {
+ fstream inp(lexfile,ios::in|ios::binary);
+ std::string w1, w2;
+ while (inp >> w1 >> w2){
+ lexicon.insert(make_pair(w1,w2));
+ }
+ }
+}
+
int main(int argc, char **argv)
{
char *testfile=NULL;
@@ -128,9 +137,10 @@ int main(int argc, char **argv)
"ngram_load_factor", CMDFLOATTYPE|CMDMSG, &ngramcache_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is false",
"context_model_active", CMDBOOLTYPE|CMDMSG, &context_model_active, "enable/disable context-dependent model (default is true)",
"context_model_normalization", CMDBOOLTYPE|CMDMSG, &context_model_normalization, "enable/disable normalization of context-dependent model (default is false)",
+ "add_lexicon_words", CMDBOOLTYPE|CMDMSG, &add_lexicon_words, "enable/disable addition of the words in the lexicon into the alternatives (default is false)",
"add_lm_words", CMDBOOLTYPE|CMDMSG, &add_lm_words, "enable/disable addition of the unigram/bigrmam successors into the alternatives (default is false)",
"add_sentence_words", CMDBOOLTYPE|CMDMSG, &add_sentence_words, "enable/disable addition of the words of the current sentence into the alternatives (default is false)",
- "add_full_dictionary", CMDBOOLTYPE|CMDMSG, &add_full_dictionary, "enable/disable addition of all words of the dictionary into the alternatives (default is false)",
+ "add_full_dictionary", CMDBOOLTYPE|CMDMSG, &add_full_dictionary, "enable/disable addition of all words of the LM dictionary into the alternatives (default is false)",
"successor_limit", CMDINTTYPE|CMDMSG, &successor_limit, "threshold to decide whether adding the unigram/bigram successors into the alternatives (default is 100)",
"Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
@@ -163,8 +173,9 @@ int main(int argc, char **argv)
if (lmfile!=NULL) VERBOSE(1, "lmfile: " << lmfile << std::endl);
if (testfile!=NULL) VERBOSE(1, "testfile: " << testfile << std::endl);
- if (lexiconfile != NULL) VERBOSE(1, "lexicon: " << lexiconfile << std::endl);
-
+ if (lexiconfile != NULL){
+ VERBOSE(1, "lexicon: " << lexiconfile << std::endl);
+ }
VERBOSE(1, "contextbasedscore: " << contextbasedscore << std::endl);
VERBOSE(1, "topicscore: " << topicscore << std::endl);
VERBOSE(1, "rankscore: " << rankscore << std::endl);
@@ -199,8 +210,19 @@ int main(int argc, char **argv)
lmt->init_caches(lmt->maxlevel());
//read lexicon form file
+
std::multimap< std::string, std::string > lexicon;
- if (lexiconfile != NULL) {
+ if (add_lexicon_words){
+ if (lexiconfile != NULL) {
+ load_lexicon(lexiconfile, lexicon);
+ }else{
+ VERBOSE(1, "You did not set any lexicon, but you activated parameter \"--add_lexicon_words\". This is formally correct; maybe you want to pass the lexicon through the input; Please check whether your setting is correct." << std::endl);
+ }
+ }else{
+ VERBOSE(1, "You set a lexicon, but you did not activate parameter \"--add_lexicon_words\". Hence, words in he lexicon are not used as alternatives" << std::endl);
+ }
+ /*
+ if (std::string lexiconfile!= NULL) {
fstream inp(lexiconfile,ios::in|ios::binary);
std::string w1, w2;
while (inp >> w1 >> w2){
@@ -208,6 +230,7 @@ int main(int argc, char **argv)
}
add_lexicon_words=true;
}
+ */
if (topicscore == true) {
if (lmt->getLanguageModelType() != _IRSTLM_LMCONTEXTDEPENDENT) {
@@ -242,10 +265,14 @@ int main(int argc, char **argv)
VERBOSE(2,"input_line:|" << line_str << "|" << std::endl);
//getting sentence string;
+ std::string tmp_sentence;
std::string sentence;
std::string context;
+ std::string sentence_lexiconfile;
- bool withContext = lmt->GetSentenceAndContext(sentence,context,line_str);
+ //remove lexicon string from the input, even if it is not used at all for this type of score
+ ((lmContextDependent*) lmt)->GetSentenceAndLexicon(tmp_sentence,sentence_lexiconfile,line_str);
+ bool withContext = lmt->GetSentenceAndContext(sentence,context,tmp_sentence);
//getting apriori topic weights
topic_map_t apriori_topic_map;
@@ -370,10 +397,14 @@ int main(int argc, char **argv)
VERBOSE(1,"input_line:|" << line_str << "|" << std::endl);
//getting sentence string;
+ std::string tmp_sentence;
std::string sentence;
std::string context;
-
- bool withContext = lmt->GetSentenceAndContext(sentence,context,line_str);
+ std::string sentence_lexiconfile;
+
+ bool withLexicon = ((lmContextDependent*) lmt)->GetSentenceAndLexicon(tmp_sentence,sentence_lexiconfile,line_str);
+ bool withContext = lmt->GetSentenceAndContext(sentence,context,tmp_sentence);
+
//getting apriori topic weights
topic_map_t apriori_topic_map;
if (withContext){
@@ -460,6 +491,12 @@ int main(int argc, char **argv)
//add words from the lexicon
if (add_lexicon_words){
+
+ if (withLexicon){
+ lexicon.clear();
+ load_lexicon(sentence_lexiconfile.c_str(), lexicon);
+ }
+
std::pair <std::multimap< std::string, std::string>::iterator, std::multimap< std::string, std::string>::iterator> ret = lexicon.equal_range(current_word);
for (std::multimap<std::string, std::string>::const_iterator it=ret.first; it!=ret.second; ++it)
{
@@ -784,10 +821,13 @@ int main(int argc, char **argv)
VERBOSE(1,"input_line:|" << line_str << "|" << std::endl);
//getting sentence string;
+ std::string tmp_sentence;
std::string sentence;
std::string context;
+ std::string sentence_lexiconfile;
- bool withContext=lmt->GetSentenceAndContext(sentence,context,line_str);
+ bool withLexicon = ((lmContextDependent*) lmt)->GetSentenceAndLexicon(tmp_sentence,sentence_lexiconfile,line_str);
+ bool withContext = lmt->GetSentenceAndContext(sentence,context,tmp_sentence);
//getting apriori topic weights
topic_map_t apriori_topic_map;
@@ -873,6 +913,12 @@ int main(int argc, char **argv)
//add words from the lexicon
if (add_lexicon_words){
+
+ if (withLexicon){
+ lexicon.clear();
+ load_lexicon(sentence_lexiconfile.c_str(), lexicon);
+ }
+
std::pair <std::multimap< std::string, std::string>::iterator, std::multimap< std::string, std::string>::iterator> ret = lexicon.equal_range(current_word);
for (std::multimap<std::string, std::string>::const_iterator it=ret.first; it!=ret.second; ++it)
{
diff --git a/src/lmContainer.h b/src/lmContainer.h
index 5fb8b2e..bb91426 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -48,10 +48,11 @@ typedef enum {LMT_FIND, //!< search: find an entry
} LMT_ACTION;
namespace irstlm {
-
+
+ static const std::string context_delimiter="___CONTEXT___";
+ static const std::string lexicon_delimiter="___LEXICON___";
typedef std::map< std::string, float > topic_map_t;
-// typedef std::map< std::string, double > lm_map_t;
class lmContainer
{
@@ -254,6 +255,9 @@ public:
return is_lmt_cache_enabled() && is_ps_cache_enabled();
}
+
+ inline std::string getContextDelimiter() const{ return context_delimiter; }
+
bool GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line);
void setContextMap(topic_map_t& topic_map, const std::string& context);
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
index 1df595a..e385b24 100644
--- a/src/lmContextDependent.cpp
+++ b/src/lmContextDependent.cpp
@@ -265,4 +265,28 @@ namespace irstlm {
logOOVpenalty=log(m_lm->getlogOOVpenalty());
return logOOVpenalty;
}
+
+ bool lmContextDependent::GetSentenceAndLexicon(std::string& sentence, std::string& lexiconfile, std::string& line)
+ {
+ VERBOSE(2,"bool lmContextDependent::GetSentenceAndLexicon" << std::endl);
+ VERBOSE(2,"line:|" << line << "|" << std::endl);
+ bool ret;
+ size_t pos = line.find(lexicon_delimiter);
+ if (pos != std::string::npos){ // lexicon_delimiter is found
+ sentence = line.substr(0, pos);
+ line.erase(0, pos + lexicon_delimiter.length());
+
+ //getting context string;
+ lexiconfile = line;
+ ret=true;
+ }else{
+ sentence = line;
+ lexiconfile = "";
+ ret=false;
+ }
+ VERBOSE(2,"sentence:|" << sentence << "|" << std::endl);
+ VERBOSE(2,"lexicon:|" << lexiconfile << "|" << std::endl);
+ return ret;
+ }
+
}//namespace irstlm
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
index c017908..782dc68 100644
--- a/src/lmContextDependent.h
+++ b/src/lmContextDependent.h
@@ -60,8 +60,6 @@ namespace irstlm {
#define LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN 6
- static const std::string context_delimiter="___CONTEXT___";
-
class lmContextDependent: public lmContainer
{
private:
@@ -96,10 +94,9 @@ namespace irstlm {
virtual ~lmContextDependent();
void load(const std::string &filename,int mmap=0);
-
-
- inline std::string getContextDelimiter() const{
- return context_delimiter;
+
+ inline std::string getLexiconDelimiter() const{
+ return lexicon_delimiter;
}
virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
@@ -215,6 +212,8 @@ namespace irstlm {
m_normalization = val;
}
+ bool GetSentenceAndLexicon(std::string& sentence, std::string& lexiconfile, std::string& line);
+
};
}//namespace irstlm
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list