[irstlm] 23/78: enabled a new context-dependent LM model based on a correction factor of basic LM probabilities
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:02 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.
commit 3d48f1353cceebade3677102ffc0cc218528f02c
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Mon Nov 9 00:45:36 2015 +0100
enabled a new context-dependent LM model based on a correction factor of basic LM probabilities
---
src/CMakeLists.txt | 5 +-
src/Makefile.am | 13 +-
src/context-dependent-evaluation.cpp | 1098 ++++++++++++++++++++++++++++++++++
src/context-similarity.cpp | 559 +++++++++++++++++
src/context-similarity.h | 138 +++++
src/lmContainer.cpp | 8 +
6 files changed, 1817 insertions(+), 4 deletions(-)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cef6d31..2717ce9 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,6 +6,7 @@ ADD_DEFINITIONS("-D_LARGE_FILES")
ADD_DEFINITIONS("-D_FILE_OFFSET_BITS=64")
ADD_DEFINITIONS("-DMYCODESIZE=3")
ADD_DEFINITIONS("-DDEBUG")
+#ADD_DEFINITIONS("-DTRACE_LEVEL=1")
if (TRACE_LEVEL)
ADD_DEFINITIONS("-DTRACE_LEVEL=${TRACE_LEVEL}")
@@ -62,12 +63,14 @@ SET( LIB_IRSTLM_SRC
cplsa.h cplsa.cpp
cswam.h cswam.cpp
doc.h doc.cpp
+ lmContextDependent.h lmContextDependent.cpp
+ context-similarity.h context-similarity.cpp
)
ADD_LIBRARY(irstlm STATIC ${LIB_IRSTLM_SRC})
LINK_DIRECTORIES (${LIBRARY_OUTPUT_PATH})
-FOREACH(CMD dict ngt tlm dtsel plsa cswa compile-lm interpolate-lm prune-lm quantize-lm score-lm verify-caching)
+FOREACH(CMD dict ngt tlm dtsel plsa cswa compile-lm interpolate-lm prune-lm quantize-lm score-lm verify-caching context-dependent-evaluation)
ADD_EXECUTABLE(${CMD} ${CMD}.cpp)
TARGET_LINK_LIBRARIES (${CMD} irstlm -lm -lz -lpthread)
diff --git a/src/Makefile.am b/src/Makefile.am
index 6169e64..6f54075 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -32,7 +32,9 @@ libirstlm_la_HEADERS = \
shiftlm.h \
cplsa.h \
cswam.h \
- doc.h
+ doc.h \
+ lmContextDependent.h \
+ context-similarity.h
libirstlm_la_SOURCES = \
cmd.c \
@@ -60,7 +62,9 @@ libirstlm_la_SOURCES = \
shiftlm.cpp \
cplsa.cpp \
cswam.cpp \
- doc.cpp
+ doc.cpp \
+ lmContextDependent.cpp \
+ context-similarity.cpp
CLEANFILES = $(BUILT_SOURCES)
@@ -69,7 +73,8 @@ libirstlm_la_LIBADD = $(BOOST_LDFLAGS) $(BOOST_THREAD_LIB)
LDADD = -lirstlm -lpthread
DEPENDENCIES = libirstlm.la
-bin_PROGRAMS = dict ngt dtsel compile-lm interpolate-lm prune-lm quantize-lm prune-lm score-lm tlm plsa verify-caching cswa
+bin_PROGRAMS = dict ngt dtsel compile-lm interpolate-lm prune-lm quantize-lm prune-lm score-lm tlm plsa verify-caching cswa context-dependent-evaluation
+
dict_SOURCES = dict.cpp
dict_DEPENDENCIES = $(DEPENDENCIES)
ngt_SOURCES = ngt.cpp
@@ -94,3 +99,5 @@ verify_caching_SOURCES = verify-caching.cpp
verify_caching_DEPENDENCIES = $(DEPENDENCIES)
cswa_SOURCES = cswa.cpp
cswa_DEPENDENCIES = $(DEPENDENCIES)
+context_dependent_evaluation_SOURCES = context-dependent-evaluation.cpp
+context_dependent_evaluation_DEPENDENCIES = $(DEPENDENCIES)
diff --git a/src/context-dependent-evaluation.cpp b/src/context-dependent-evaluation.cpp
new file mode 100644
index 0000000..0b32f27
--- /dev/null
+++ b/src/context-dependent-evaluation.cpp
@@ -0,0 +1,1098 @@
+// $Id: compile-lm.cpp 3677 2010-10-13 09:06:51Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit, compile LM
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
+
+
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <string>
+#include <stdlib.h>
+#include "cmd.h"
+#include "util.h"
+#include "math.h"
+#include "lmContainer.h"
+#include "lmContextDependent.h"
+
+using namespace std;
+using namespace irstlm;
+
+typedef std::pair<double,int> double_and_int_pair;
+
+struct cmp_double_and_int_pair {
+ //order first by the first field (double), and in case of equality by the second field (int)
+ bool operator()(const double_and_int_pair& a, const double_and_int_pair& b) const {
+ if (a.first < b.first){
+ return true;
+ }else if (a.first > b.first){
+ return false;
+ }else{
+ if (a.second<b.second){
+ return true;
+ }else{
+ return false;
+ }
+ }
+ }
+};
+
+typedef std::map<int, double_and_int_pair> int_to_double_and_int_map;
+//typedef std::map<double_and_int_pair,int,cmp_double_and_int_pair> double_and_int_to_int_map;
+typedef std::map<double_and_int_pair,double_and_int_pair,cmp_double_and_int_pair> double_and_int_to_double_and_int_map;
+
+/********************************/
+void print_help(int TypeFlag=0){
+ std::cerr << std::endl << "context-dependent-evaluation - compute ngram probabilities and text perplexity given a LM" << std::endl;
+ std::cerr << std::endl << "USAGE:" << std::endl;
+ std::cerr << " context-dependent-evaluation [options] lm=<input-file.lm>" << std::endl;
+ std::cerr << std::endl << "DESCRIPTION:" << std::endl;
+ std::cerr << " context-dependent-evaluation uses the given LM to compute ngram probabilities and text perplexity of the input" << std::endl;
+ std::cerr << " The LM must be in a IRSTLM-compliant type" << std::endl;
+ std::cerr << std::endl << "OPTIONS:" << std::endl;
+
+ FullPrintParams(TypeFlag, 0, 1, stderr);
+}
+
+void usage(const char *msg = 0){
+ if (msg) {
+ std::cerr << msg << std::endl;
+ }
+ if (!msg){
+ print_help();
+ }
+}
+
+void load_lexicon(const char* lexfile, std::multimap< std::string, std::string >& lexicon){
+ if (lexfile!= NULL) {
+ fstream inp(lexfile,ios::in|ios::binary);
+ std::string w1, w2;
+ while (inp >> w1 >> w2){
+ lexicon.insert(make_pair(w1,w2));
+ }
+ }
+}
+
+int main(int argc, char **argv)
+{
+ char *testfile=NULL;
+ char *lmfile=NULL;
+
+ bool sent_flag = false;
+ bool contextbasedscore = false;
+ bool topicscore = false;
+ bool rankscore = false;
+ bool context_model_active = true;
+ bool context_model_normalization = false;
+ char *lexiconfile=NULL;
+
+ bool add_lexicon_words = false;
+ bool add_lm_words = false;
+ bool add_sentence_words = false;
+ bool add_full_dictionary = false;
+ int successor_limit=100;
+
+ int debug = 0;
+ int requiredMaxlev = 1000;
+ int dub = 10000000;
+ int randcalls = 0;
+ float ngramcache_load_factor = 0.0;
+ float dictionary_load_factor = 0.0;
+
+ bool help=false;
+
+ DeclareParams((char*)
+ "lm", CMDSTRINGTYPE|CMDMSG, &lmfile, "LM to load",
+ "test", CMDSTRINGTYPE|CMDMSG, &testfile, "computes scores of the specified text file",
+ "lexicon", CMDSTRINGTYPE|CMDMSG, &lexiconfile, "lexicon file contains associated words (required by rankscore)",
+ "randcalls", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
+ "r", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
+ "contextbasedscore", CMDBOOLTYPE|CMDMSG, &contextbasedscore, "computes context-dependent probabilities and pseudo-perplexity of the text from standard input",
+ "topicscore", CMDBOOLTYPE|CMDMSG, &topicscore, "computes the topic scores of the text from standard input",
+ "rankscore", CMDBOOLTYPE|CMDMSG, &rankscore, "computes the average rank position of the text from standard input",
+ "debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
+ "d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
+ "level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
+ "l", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
+ "dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7",
+ "sentence", CMDBOOLTYPE|CMDMSG, &sent_flag, "computes perplexity at sentence level (identified through the end symbol)",
+ "dict_load_factor", CMDFLOATTYPE|CMDMSG, &dictionary_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is 0",
+ "ngram_load_factor", CMDFLOATTYPE|CMDMSG, &ngramcache_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is false",
+ "context_model_active", CMDBOOLTYPE|CMDMSG, &context_model_active, "enable/disable context-dependent model (default is true)",
+ "context_model_normalization", CMDBOOLTYPE|CMDMSG, &context_model_normalization, "enable/disable normalization of context-dependent model (default is false)",
+ "add_lexicon_words", CMDBOOLTYPE|CMDMSG, &add_lexicon_words, "enable/disable addition of the words in the lexicon into the alternatives (default is false)",
+ "add_lm_words", CMDBOOLTYPE|CMDMSG, &add_lm_words, "enable/disable addition of the unigram/bigrmam successors into the alternatives (default is false)",
+ "add_sentence_words", CMDBOOLTYPE|CMDMSG, &add_sentence_words, "enable/disable addition of the words of the current sentence into the alternatives (default is false)",
+ "add_full_dictionary", CMDBOOLTYPE|CMDMSG, &add_full_dictionary, "enable/disable addition of all words of the LM dictionary into the alternatives (default is false)",
+ "successor_limit", CMDINTTYPE|CMDMSG, &successor_limit, "threshold to decide whether adding the unigram/bigram successors into the alternatives (default is 100)",
+
+ "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
+ "h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
+
+ (char*)NULL
+ );
+
+ if (argc == 1){
+ usage();
+ exit_error(IRSTLM_NO_ERROR);
+ }
+
+ GetParams(&argc, &argv, (char*) NULL);
+
+ if (help){
+ usage();
+ exit_error(IRSTLM_NO_ERROR);
+ }
+
+ if (lmfile == NULL) {
+ usage();
+ exit_error(IRSTLM_ERROR_DATA,"Warning: Please specify a LM file to read from");
+ }
+
+ if (testfile == NULL) {
+ usage();
+ exit_error(IRSTLM_NO_ERROR);
+ }
+
+ if (lmfile!=NULL) VERBOSE(1, "lmfile: " << lmfile << std::endl);
+ if (testfile!=NULL) VERBOSE(1, "testfile: " << testfile << std::endl);
+ if (lexiconfile != NULL){
+ VERBOSE(1, "lexicon: " << lexiconfile << std::endl);
+ }
+ VERBOSE(1, "contextbasedscore: " << contextbasedscore << std::endl);
+ VERBOSE(1, "topicscore: " << topicscore << std::endl);
+ VERBOSE(1, "rankscore: " << rankscore << std::endl);
+
+ VERBOSE(1,"add_lexicon_words: " << add_lexicon_words << std::endl);
+ VERBOSE(1,"add_lm_words: " << add_lm_words << " successor_limit:" << successor_limit<< std::endl);
+ VERBOSE(1,"add_sentence_words: " << add_sentence_words << std::endl);
+
+ std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl;
+ std::cerr << "dub: " << dub<< std::endl;
+
+
+ if (topicscore == true) {
+ VERBOSE(0, "NOT SUPPORTED" << std::endl);
+ return 0;
+ }
+
+ //checking the language model type
+ std::string infile(lmfile);
+
+ lmContainer* lmt = lmContainer::CreateLanguageModel(infile,ngramcache_load_factor,dictionary_load_factor);
+
+ lmt->setMaxLoadedLevel(requiredMaxlev);
+
+ lmt->load(infile);
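+ //NOTE: the casts below (and throughout this tool) assume that the loaded LM is a
+ //context-dependent LM (type _IRSTLM_LMCONTEXTDEPENDENT)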
+ ((lmContextDependent*) lmt)->set_Active(context_model_active);
+ ((lmContextDependent*) lmt)->set_Normalized(context_model_normalization);
+
+ if (dub) lmt->setlogOOVpenalty((int)dub);
+
+ //use caches to save time (only if PS_CACHE_ENABLE is defined through compilation flags)
+ lmt->init_caches(lmt->maxlevel());
+
+ //read lexicon from file
+
+ std::multimap< std::string, std::string > lexicon;
+ if (add_lexicon_words){
+ if (lexiconfile != NULL) {
+ load_lexicon(lexiconfile, lexicon);
+ }else{
+ VERBOSE(1, "You did not set any lexicon, but you activated parameter \"--add_lexicon_words\". This is formally correct; maybe you want to pass the lexicon through the input; Please check whether your setting is correct." << std::endl);
+ }
+ }else{
+ VERBOSE(1, "You set a lexicon, but you did not activate parameter \"--add_lexicon_words\". Hence, words in he lexicon are not used as alternatives" << std::endl);
+ }
+ /*
+ if (std::string lexiconfile!= NULL) {
+ fstream inp(lexiconfile,ios::in|ios::binary);
+ std::string w1, w2;
+ while (inp >> w1 >> w2){
+ lexicon.insert(make_pair(w1,w2));
+ }
+ add_lexicon_words=true;
+ }
+ */
+
+ if (topicscore == true) {
+ if (lmt->getLanguageModelType() != _IRSTLM_LMCONTEXTDEPENDENT) {
+ exit_error(IRSTLM_ERROR_DATA, "This type of score is not available for the LM loaded");
+ }
+ if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ if (lmt->getLanguageModelType() == _IRSTLM_LMMACRO) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ if (lmt->getLanguageModelType() == _IRSTLM_LMCLASS) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+
+ std::cerr << "Start Topic Score generation " << std::endl;
+ std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
+ std::cout.setf(ios::fixed);
+ std::cout.precision(2);
+
+ std::fstream inptxt(testfile,std::ios::in);
+
+ // loop over input lines
+ char line[MAX_LINE];
+ while (inptxt.getline(line,MAX_LINE)) {
+
+ std::string line_str = line;
+
+ VERBOSE(2,"input_line:|" << line_str << "|" << std::endl);
+
+ //getting sentence string;
+ std::string tmp_sentence;
+ std::string sentence;
+ std::string context;
+ std::string sentence_lexiconfile;
+
+ //remove lexicon string from the input, even if it is not used at all for this type of score
+ ((lmContextDependent*) lmt)->GetSentenceAndLexicon(tmp_sentence,sentence_lexiconfile,line_str);
+ bool withContext = lmt->GetSentenceAndContext(sentence,context,tmp_sentence);
+
+ //getting apriori topic weights
+ topic_map_t apriori_topic_map;
+ if (withContext){
+ lmt->setContextMap(apriori_topic_map,context);
+ }
+ // computation using std::string
+ // loop over ngrams of the sentence
+ string_vec_t word_vec;
+ split(sentence, ' ', word_vec);
+
+ //first points to the least recent term to take into account
+ //last points to the position after the most recent term to take into account
+ //last could point outside the vector of strings; do NOT use word_vec.at(last)
+ size_t last, first;
+ size_t size=0;
+ size_t order = lmt->maxlevel();
+
+ topic_map_t sentence_topic_map;
+ for (size_t i=0; i<word_vec.size(); ++i){
+ ++size;
+ size=(size<order)?size:order;
+ last=i+1;
+
+ // reset ngram at begin of sentence
+ if (word_vec.at(i) == lmt->getDict()->BoS()) {
+ size=1;
+ continue;
+ }
+ first = last - size;
+
+ string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
+
+ if (size>=1) {
+ VERBOSE(2,"computing topic_scores for first:|" << first << "| and last:|" << last << "|" << std::endl);
+
+ topic_map_t tmp_topic_map;
+ ((lmContextDependent*) lmt)->getContextSimilarity()->get_topic_scores(tmp_word_vec, tmp_topic_map);
+ IFVERBOSE(2){
+ VERBOSE(2,"before normalization word-based topic-distribution:");
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
+ }
+ ((lmContextDependent*) lmt)->getContextSimilarity()->normalize_topic_scores(tmp_topic_map);
+ IFVERBOSE(2){
+ VERBOSE(2,"after normalization word-based topic-distribution:");
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
+ }
+ VERBOSE(2,"first:" << first << " last:" << last << " tmp_topic_map.size:" << tmp_topic_map.size() << std::endl);
+
+ ((lmContextDependent*) lmt)->getContextSimilarity()->add_topic_scores(sentence_topic_map, tmp_topic_map);
+ IFVERBOSE(2){
+ VERBOSE(2,"word-based topic-distribution:");
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map,apriori_topic_map,1);
+ }
+ tmp_topic_map.clear();
+ }
+ }
+ IFVERBOSE(2){
+ VERBOSE(2,"sentence-based topic-distribution:");
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map,apriori_topic_map,last);
+ }
+ std::cout << sentence << ((lmContextDependent*) lmt)->getContextDelimiter();
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);
+ apriori_topic_map.clear();
+ }
+
+ delete lmt;
+ return 0;
+ }
+ if (contextbasedscore == true) {
+
+ if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ if (lmt->getLanguageModelType() == _IRSTLM_LMMACRO) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ if (lmt->getLanguageModelType() == _IRSTLM_LMCLASS) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ std::cerr << "Start ContextBased Evaluation" << std::endl;
+ std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
+ std::cout.setf(ios::fixed);
+ std::cout.precision(2);
+
+ int Nw=0,Noov=0;
+ double logPr=0,PP=0,PPwp=0,current_Pr;
+ double norm_logPr=0,norm_PP=0,norm_PPwp=0,norm_Pr;
+ double model_logPr=0,model_PP=0,model_PPwp=0,model_Pr;
+ double model_norm_logPr=0,model_norm_PP=0,model_norm_PPwp=0,model_norm_Pr;
+ int current_dict_alternatives = 0;
+
+ double bow;
+ int bol=0;
+ char *msp;
+ ngram_state_t msidx;
+ unsigned int statesize;
+
+ // variables for storing sentence-based Perplexity
+ int sent_Nw=0,sent_Noov=0;
+ double sent_logPr=0,sent_PP=0,sent_PPwp=0;
+ double sent_norm_logPr=0,sent_norm_PP=0,sent_norm_PPwp=0;
+ double sent_model_logPr=0,sent_model_PP=0,sent_model_PPwp=0;
+ double sent_model_norm_logPr=0,sent_model_norm_PP=0,sent_model_norm_PPwp=0;
+ int sent_current_dict_alternatives = 0;
+
+ double oovpenalty = lmt->getlogOOVpenalty();
+ double norm_oovpenalty = oovpenalty;
+
+ VERBOSE(1,"oovpenalty:" << oovpenalty << std::endl);
+
+ std::fstream inptxt(testfile,std::ios::in);
+
+ // loop over input lines
+ char line[MAX_LINE];
+ while (inptxt.getline(line,MAX_LINE)) {
+
+ std::string line_str = line;
+
+ VERBOSE(1,"input_line:|" << line_str << "|" << std::endl);
+
+ //getting sentence string;
+ std::string tmp_sentence;
+ std::string sentence;
+ std::string context;
+ std::string sentence_lexiconfile;
+
+ bool withLexicon = ((lmContextDependent*) lmt)->GetSentenceAndLexicon(tmp_sentence,sentence_lexiconfile,line_str);
+ bool withContext = lmt->GetSentenceAndContext(sentence,context,tmp_sentence);
+
+ //getting apriori topic weights
+ topic_map_t apriori_topic_map;
+ if (withContext){
+ ((lmContextDependent*) lmt)->setContextMap(apriori_topic_map,context);
+ }
+ // computation using std::string
+ // loop over ngrams of the sentence
+ string_vec_t word_vec;
+ split(sentence, ' ', word_vec);
+
+ //first points to the least recent term to take into account
+ //last points to the position after the most recent term to take into account
+ //last could point outside the vector of strings; do NOT use word_vec.at(last)
+ size_t last, first;
+ size_t order = lmt->maxlevel();
+
+ //start the computation from the second word, because the first is the BoS symbol, but include BoS in the ngrams
+ size_t size=0;
+ for (size_t i=0; i< word_vec.size(); ++i){
+ ++size;
+ size=(size<order)?size:order;
+ last=i+1;
+
+ // reset ngram at begin of sentence
+ if (word_vec.at(i) == lmt->getDict()->BoS()) {
+ size=1;
+ continue;
+ }
+ first = last - size;
+
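+ //the ngram under evaluation spans word_vec[first..last-1]: at most 'order' words
+ //ending at position i, restarted after each BoS symbol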
+ string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
+
+ if (size>=1) {
+ VERBOSE(2,"computing prob for first:|" << first << "| and last:|" << last << "|" << std::endl);
+
+ VERBOSE(2,"word_vec.at(i):|" << word_vec.at(i) << "| lmt->getDict()->OOV():|" << lmt->getDict()->OOV() << "|" << std::endl);
+ if (lmt->getDict()->encode(word_vec.at(i).c_str()) == lmt->getDict()->oovcode()) {
+ Noov++;
+ sent_Noov++;
+ }
+ Nw++;
+ sent_Nw++;
+
+ if ((Nw % 100000)==0) {
+ std::cerr << ".";
+ lmt->check_caches_levels();
+ }
+
+
+ VERBOSE(2,"tmp_word_vec.size:|" << tmp_word_vec.size() << "|" << std::endl);
+ VERBOSE(2,"dict.size:|" << lmt->getDict()->size() << "|" << std::endl);
+
+ if (withContext){
+ current_Pr = lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msidx, &msp, &statesize);
+ }else{
+ current_Pr = lmt->clprob(tmp_word_vec, &bow, &bol, &msidx, &msp, &statesize);
+ }
+ /*
+ double tot_pr = 0.0;
+ if (context_model_normalization){
+ tot_pr = ((lmContextDependent*) lmt)->total_clprob(tmp_word_vec, apriori_topic_map);
+ }
+ */
+
+ // string_vec_t::iterator it=tmp_word_vec.end()-1;
+ int current_pos = tmp_word_vec.size()-1;
+ std::string current_word = tmp_word_vec.at(current_pos);
+
+ //loop over a set of selected alternative words
+ //populate the dictionary with all words associated with the current word
+
+ dictionary* current_dict;
+ if (add_full_dictionary){
+ //loop over all words in the LM
+ current_dict = lmt->getDict();
+ }else{
+ current_dict = new dictionary((char *)NULL,1000000);
+ }
+ current_dict->incflag(1);
+
+ current_dict->encode(current_word.c_str());
+
+ VERBOSE(2,"after current word current_dict->size:" << current_dict->size() << std::endl);
+
+ //add words from the lexicon
+ if (add_lexicon_words){
+
+ if (withLexicon){
+ lexicon.clear();
+ load_lexicon(sentence_lexiconfile.c_str(), lexicon);
+ }
+
+ std::pair <std::multimap< std::string, std::string>::iterator, std::multimap< std::string, std::string>::iterator> ret = lexicon.equal_range(current_word);
+ for (std::multimap<std::string, std::string>::const_iterator it=ret.first; it!=ret.second; ++it)
+ {
+ current_dict->encode((it->second).c_str());
+ /*
+ //exclude the current word from the selected alternative words
+ if (current_word != (it->second).c_str()){
+ current_dict->encode((it->second).c_str());
+ }
+ */
+ }
+ }
+ VERBOSE(2,"after add_lexicon_words current_dict->size:" << current_dict->size() << std::endl);
+
+ if (add_lm_words){
+ bool succ_flag=false;
+ ngram hg(lmt->getDict());
+
+ if (size==1) {
+ hg.pushw(lmt->getDict()->BoS());
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+ if (hg.succ < successor_limit){
+ succ_flag=true;
+ }else{
+ VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);
+ }
+ }else if (size>=2) {
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+ if (hg.succ < successor_limit){
+ succ_flag=true;
+ }else{
+ VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);
+ }
+
+ if (!succ_flag && size>=3){
+ hg.size=0;
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-3));
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+
+ if (hg.succ < successor_limit){
+ succ_flag=true;
+ }else{
+ VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);
+ }
+ }
+ }
+
+
+ if (succ_flag){
+ ngram ng=hg;
+ lmt->succscan(hg,ng,LMT_INIT,ng.size);
+ while(lmt->succscan(hg,ng,LMT_CONT,ng.size)) {
+ current_dict->encode(ng.dict->decode(*ng.wordp(1)));
+ }
+ }
+
+ }
+ VERBOSE(2,"after add_lm_words current_dict->size:" << current_dict->size() << std::endl);
+
+ if (add_sentence_words){
+ for (string_vec_t::const_iterator it=word_vec.begin(); it!=word_vec.end(); ++it)
+ {
+ current_dict->encode(it->c_str());
+ }
+ }
+ current_dict->incflag(0);
+ VERBOSE(2,"after add_sentence_words current_dict->size:" << current_dict->size() << std::endl);
+
+ sent_current_dict_alternatives += current_dict->size();
+ current_dict_alternatives += current_dict->size();
+
+ VERBOSE(2,"current_dict->size:" << current_dict->size() << std::endl);
+ for (int h=0;h<current_dict->size();++h){
+ VERBOSE(3,"h:" << h << " w:|" << current_dict->decode(h) << "|" << std::endl);
+ }
+
+ //the first word in current_dict is always the current_word; hence we can skip it during the scan
+ //variables for the computation of the oracle probability, i.e. the maximum prob
+ //double best_pr = -1000000.0;
+ //int best_code = lmt->getlogOOVpenalty();
+ double best_pr = current_Pr;
+ int best_code = 0;
+ //variables for the computation of the mass probability related to the current word, i.e. the sum of the probs for all words associated with the current word
+ double current_tot_pr = pow(10.0,current_Pr);
+ // for (int j=0; j<current_dict->size(); ++j){
+ for (int j=1; j<current_dict->size(); ++j)
+ {
+ //loop over all words in the LM
+ tmp_word_vec.at(current_pos) = current_dict->decode(j);
+ IFVERBOSE(3){
+ std::cout << "tmp_word_vec j:" << j;
+ for (string_vec_t::const_iterator it2=tmp_word_vec.begin(); it2!=tmp_word_vec.end(); ++it2) {
+ std::cout << " |" << (*it2) << "|";
+ }
+ std::cout << std::endl;
+ }
+
+ double pr;
+ if (withContext){
+ pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msidx, &msp, &statesize, NULL, NULL);
+ }else{
+ pr=lmt->clprob(tmp_word_vec, &bow, &bol, &msidx, &msp, &statesize, NULL, NULL);
+ }
+ current_tot_pr += pow(10.0,pr);
+ if (best_pr < pr){
+ best_pr = pr;
+ best_code = j;
+ VERBOSE(3,"current_best:" << best_code << " current_word:|" << current_dict->decode(best_code) << "| best_prob:" << pow(10.0,best_pr) << " norm_best_prob:" << pow(10.0,best_pr - current_tot_pr) << std::endl);
+ }
+ VERBOSE(3,"current_Pr:" << current_Pr << " current_word:" << current_word << "| ===> code:" << j << " word:|" << tmp_word_vec.at(current_pos) << "| pr:" << pr << " versus best_code:" << best_code << " best_word:|" << current_dict->decode(best_code) << "| best_pr:" << best_pr << std::endl);
+ }
+
+ current_tot_pr=log10(current_tot_pr);
+
+ model_Pr = best_pr;
+ VERBOSE(2,"model_best_code:" << best_code << " model_best_word:|" << current_dict->decode(best_code) << "| model_best_prob:" << pow(10.0,best_pr) << " current_tot_pr:" << current_tot_pr << std::endl);
+
+ norm_oovpenalty = oovpenalty;
+ VERBOSE(2,"current_tot_pr:" << current_tot_pr << " oovpenalty:" << oovpenalty << " norm_oovpenalty:" << norm_oovpenalty << std::endl);
+
+ norm_Pr = current_Pr - current_tot_pr;
+ model_norm_Pr = model_Pr - current_tot_pr;
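+ //norm_Pr and model_norm_Pr are log10 probabilities renormalized over the set of
+ //alternatives: norm_Pr = log10( P(w|h) / sum_{v in alternatives} P(v|h) )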
+ VERBOSE(1,"current_Pr:" << current_Pr << " norm_Pr:" << norm_Pr << " model_Pr:" << model_Pr << " model_norm_Pr:" << model_norm_Pr << " current_code:" << lmt->getDict()->encode(word_vec.at(i).c_str()) << " current_word:|" << word_vec.at(i) << "| model_best_code:" << best_code << " model_best_word:|" << current_dict->decode(best_code) << "|" << std::endl);
+
+ model_norm_logPr+=model_norm_Pr;
+ sent_model_norm_logPr+=model_norm_Pr;
+ norm_logPr+=norm_Pr;
+ sent_norm_logPr+=norm_Pr;
+ VERBOSE(2,"sent_model_norm_logPr:" << sent_model_norm_logPr << " model_norm_logPr:" << model_norm_logPr << std::endl);
+ VERBOSE(2,"sent_norm_logPr:" << sent_norm_logPr << " norm_logPr:" << norm_logPr << std::endl);
+
+ model_logPr+=model_Pr;
+ sent_model_logPr+=model_Pr;
+ logPr+=current_Pr;
+ sent_logPr+=current_Pr;
+ VERBOSE(2,"sent_model_logPr:" << sent_model_logPr << " model_logPr:" << model_logPr << std::endl);
+ VERBOSE(2,"sent_logPr:" << sent_logPr << " current_Pr:" << current_Pr << std::endl);
+ delete current_dict;
+ }
+ }
+
+ if (sent_flag) {
+ sent_model_norm_PP = exp((-sent_model_norm_logPr * M_LN10) / sent_Nw);
+ sent_model_norm_PPwp = sent_model_norm_PP * (1 - 1/exp(sent_Noov * norm_oovpenalty * M_LN10 / sent_Nw));
+ sent_norm_PP = exp((-sent_norm_logPr * M_LN10) / sent_Nw);
+ sent_norm_PPwp = sent_norm_PP * (1 - 1/exp(sent_Noov * norm_oovpenalty * M_LN10 / sent_Nw));
+
+
+ std::cout << "%% sent_Nw=" << sent_Nw
+ << " sent_norm_logPr=" << sent_norm_logPr
+ << " sent_norm_PP=" << sent_norm_PP
+ << " sent_norm_PPwp=" << sent_norm_PPwp
+ << " sent_norm_PP_noOOV=" << (sent_norm_PP-sent_norm_PPwp)
+ << " sent_Noov=" << sent_Noov
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%"
+ << " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+ << std::endl;
+
+ std::cout << "%% sent_Nw=" << sent_Nw
+ << " sent_model_norm_logPr=" << sent_model_norm_logPr
+ << " sent_model_norm_PP=" << sent_model_norm_PP
+ << " sent_model_norm_PPwp=" << sent_model_norm_PPwp
+ << " sent_model_norm_PP_noOOV=" << (sent_model_norm_PP-sent_model_norm_PPwp)
+ << " sent_Noov=" << sent_Noov
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%"
+ << " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+ << std::endl;
+
+ sent_model_PP = exp((-sent_model_logPr * M_LN10) / sent_Nw);
+ sent_model_PPwp = sent_model_PP * (1 - 1/exp(sent_Noov * oovpenalty * M_LN10 / sent_Nw));
+ sent_PP = exp((-sent_logPr * M_LN10) / sent_Nw);
+ sent_PPwp = sent_PP * (1 - 1/exp(sent_Noov * oovpenalty * M_LN10 / sent_Nw));
+ std::cout << "%% sent_Nw=" << sent_Nw
+ << " sent_logPr=" << sent_logPr
+ << " sent_PP=" << sent_PP
+ << " sent_PPwp=" << sent_PPwp
+ << " sent_PP_noOOV=" << (sent_PP-sent_PPwp)
+ << " sent_Noov=" << sent_Noov
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%"
+ << " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+ << std::endl;
+
+ std::cout << "%% sent_Nw=" << sent_Nw
+ << " sent_model_logPr=" << sent_model_logPr
+ << " sent_model_PP=" << sent_model_PP
+ << " sent_model_PPwp=" << sent_model_PPwp
+ << " sent_model_PP_noOOV=" << (sent_model_PP-sent_model_PPwp)
+ << " sent_Noov=" << sent_Noov
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%"
+ << " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+ << std::endl;
+ std::cout.flush();
+ //reset statistics for sentence based Perplexity
+ sent_Noov = 0;
+ sent_Nw = 0;
+ sent_model_norm_logPr = 0.0;
+ sent_model_logPr = 0.0;
+ sent_norm_logPr = 0.0;
+ sent_logPr = 0.0;
+ sent_current_dict_alternatives = 0;
+ }
+
+ apriori_topic_map.clear();
+ }
+
+ model_norm_PP = exp((-model_norm_logPr * M_LN10) / Nw);
+ model_norm_PPwp = model_norm_PP * (1 - 1/exp(Noov * norm_oovpenalty * M_LN10 / Nw));
+ model_PP = exp((-model_logPr * M_LN10) / Nw);
+ model_PPwp = model_PP * (1 - 1/exp(Noov * oovpenalty * M_LN10 / Nw));
+ norm_PP = exp((-norm_logPr * M_LN10) / Nw);
+ norm_PPwp = norm_PP * (1 - 1/exp(Noov * norm_oovpenalty * M_LN10 / Nw));
+ PP = exp((-logPr * M_LN10) / Nw);
+ PPwp = PP * (1 - 1/exp(Noov * oovpenalty * M_LN10 / Nw));
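+ //logPr accumulates log10 probabilities, so PP = exp((-logPr * M_LN10) / Nw) = 10^(-logPr/Nw);
+ //PPwp is the share of perplexity attributed to the OOV word penalty
+ //(PP_noOOV = PP - PPwp is reported below)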
+
+ std::cout << "%% Nw=" << Nw
+ << " model_logPr=" << model_logPr
+ << " model_PP=" << model_PP
+ << " model_PPwp=" << model_PPwp
+ << " model_PP_noOOV=" << (model_PP-model_PPwp)
+ << " Noov=" << Noov
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+ << " avg_alternatives=" << (float) current_dict_alternatives/Nw
+ << std::endl;
+ std::cout.flush();
+
+ std::cout << "%% Nw=" << Nw
+ << " model_norm_logPr=" << model_norm_logPr
+ << " model_norm_PP=" << model_norm_PP
+ << " model_norm_PPwp=" << model_norm_PPwp
+ << " model_norm_PP_noOOV=" << (model_norm_PP-model_norm_PPwp)
+ << " Noov=" << Noov
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+ << " avg_alternatives=" << (float) current_dict_alternatives/Nw
+ << std::endl;
+ std::cout.flush();
+
+ std::cout << "%% Nw=" << Nw
+ << " logPr=" << logPr
+ << " PP=" << PP
+ << " PPwp=" << PPwp
+ << " PP_noOOV=" << (PP-PPwp)
+ << " Noov=" << Noov
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+ << " avg_alternatives=" << (float) current_dict_alternatives/Nw
+ << std::endl;
+ std::cout.flush();
+
+ std::cout << "%% Nw=" << Nw
+ << " norm_logPr=" << norm_logPr
+ << " norm_PP=" << norm_PP
+ << " norm_PPwp=" << norm_PPwp
+ << " norm_PP_noOOV=" << (norm_PP-norm_PPwp)
+ << " Noov=" << Noov
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+ << " avg_alternatives=" << (float) current_dict_alternatives/Nw
+ << std::endl;
+ std::cout.flush();
+
+ if (debug>1) lmt->used_caches();
+
+ if (debug>1) lmt->stat();
+
+ delete lmt;
+ return 0;
+ }
+ if (rankscore == true) {
+
+ if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ if (lmt->getLanguageModelType() == _IRSTLM_LMMACRO) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ if (lmt->getLanguageModelType() == _IRSTLM_LMCLASS) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ std::cerr << "Start RankBased Evaluation" << std::endl;
+ std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
+ std::cout.setf(ios::fixed);
+ std::cout.precision(2);
+
+ int Nw=0,Noov=0;
+ double avgRank;
+ int tot_rank = 0;
+ int max_rank = 0;
+ int current_dict_alternatives = 0;
+
+ double bow;
+ int bol=0;
+ char *msp;
+ ngram_state_t msidx;
+ unsigned int statesize;
+
+ // variables for storing sentence-based Rank Statistics
+ int sent_Nw=0,sent_Noov=0;
+ double sent_avgRank;
+ int sent_tot_rank = 0;
+ int sent_id = 0;
+ int sent_current_dict_alternatives = 0;
+
+ std::fstream inptxt(testfile,std::ios::in);
+
+ // loop over input lines
+ char line[MAX_LINE];
+ while (inptxt.getline(line,MAX_LINE)) {
+
+ std::string line_str = line;
+
+ VERBOSE(1,"input_line:|" << line_str << "|" << std::endl);
+
+ //getting sentence string;
+ std::string tmp_sentence;
+ std::string sentence;
+ std::string context;
+ std::string sentence_lexiconfile;
+
+ bool withLexicon = ((lmContextDependent*) lmt)->GetSentenceAndLexicon(tmp_sentence,sentence_lexiconfile,line_str);
+ bool withContext = lmt->GetSentenceAndContext(sentence,context,tmp_sentence);
+
+ //getting apriori topic weights
+ topic_map_t apriori_topic_map;
+ if (withContext){
+ ((lmContextDependent*) lmt)->setContextMap(apriori_topic_map,context);
+ }
+
+ // computation using std::string
+ // loop over ngrams of the sentence
+ string_vec_t word_vec;
+ split(sentence, ' ', word_vec);
+
+ //first points to the least recent term to take into account
+ //last points to the position after the most recent term to take into account
+ //last could point outside the vector of strings; do NOT use word_vec.at(last)
+ size_t last, first;
+ size_t size=0;
+ size_t order = lmt->maxlevel();
+
+ std::stringstream rank_outstr;
+
+ for (size_t word_pos=0; word_pos<word_vec.size(); ++word_pos){
+ ++size;
+ size=(size<order)?size:order;
+ last=word_pos+1;
+
+ // reset ngram at begin of sentence
+ if (word_vec.at(word_pos) == lmt->getDict()->BoS()) {
+ size=1;
+ continue;
+ }
+ first = last - size;
+
+ string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
+
+ if (size>=1) {
+
+ VERBOSE(2,"computing rank for first:|" << first << "| and last:|" << last << "|" << std::endl);
+
+ VERBOSE(2,"word_vec.at(word_pos):|" << word_vec.at(word_pos) << "| lmt->getDict()->OOV():|" << lmt->getDict()->OOV() << "|" << std::endl);
+
+ if (lmt->getDict()->encode(word_vec.at(word_pos).c_str()) == lmt->getDict()->oovcode()) {
+ Noov++;
+ sent_Noov++;
+ }
+ Nw++;
+ sent_Nw++;
+
+ if ((Nw % 100000)==0) {
+ std::cerr << ".";
+ lmt->check_caches_levels();
+ }
+
+ VERBOSE(2,"tmp_word_vec.size:|" << tmp_word_vec.size() << "|" << std::endl);
+ VERBOSE(2,"dict.size:|" << lmt->getDict()->size() << "|" << std::endl);
+ string_vec_t::iterator it=tmp_word_vec.end()-1;
+
+ int current_pos = tmp_word_vec.size()-1;
+ std::string current_word = tmp_word_vec.at(current_pos);
+
+ double current_Pr;
+ if (withContext){
+ current_Pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msidx, &msp, &statesize);
+ }else{
+ current_Pr=lmt->clprob(tmp_word_vec, &bow, &bol, &msidx, &msp, &statesize);
+ }
+
+ //loop over a set of selected alternative words
+ //populate the dictionary with all words associated with the current word
+
+ dictionary* current_dict;
+ if (add_full_dictionary){
+ //loop over all words in the LM
+ current_dict = lmt->getDict();
+ }else{
+ current_dict = new dictionary((char *)NULL,1000000);
+ }
+ current_dict->incflag(1);
+
+ current_dict->encode(current_word.c_str());
+
+ VERBOSE(2,"after current word current_dict->size:" << current_dict->size() << std::endl);
+
+ //add words from the lexicon
+ if (add_lexicon_words){
+
+ if (withLexicon){
+ lexicon.clear();
+ load_lexicon(sentence_lexiconfile.c_str(), lexicon);
+ }
+
+ std::pair <std::multimap< std::string, std::string>::iterator, std::multimap< std::string, std::string>::iterator> ret = lexicon.equal_range(current_word);
+ for (std::multimap<std::string, std::string>::const_iterator it=ret.first; it!=ret.second; ++it)
+ {
+ current_dict->encode((it->second).c_str());
+ /*
+ //exclude the current word from the selected alternative words
+ if (current_word != (it->second).c_str()){
+ current_dict->encode((it->second).c_str());
+ }
+ */
+ }
+ }
+ VERBOSE(2,"after add_lexicon_words current_dict->size:" << current_dict->size() << std::endl);
+
+ if (add_lm_words){
+ bool succ_flag=false;
+ ngram hg(lmt->getDict());
+
+ if (size==1) {
+ hg.pushw(lmt->getDict()->BoS());
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+ if (hg.succ < successor_limit){
+ succ_flag=true;
+ }else{
+ VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);
+ }
+ }else if (size>=2) {
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+ if (hg.succ < successor_limit){
+ succ_flag=true;
+ }else{
+ VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);
+ }
+
+ if (!succ_flag && size>=3){
+ hg.size=0;
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-3));
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+
+ if (hg.succ < successor_limit){
+ succ_flag=true;
+ }else{
+ VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);
+ }
+ }
+ }
+
+
+ if (succ_flag){
+ ngram ng=hg;
+ lmt->succscan(hg,ng,LMT_INIT,ng.size);
+ while(lmt->succscan(hg,ng,LMT_CONT,ng.size)) {
+ current_dict->encode(ng.dict->decode(*ng.wordp(1)));
+ }
+ }
+
+ }
+
+ VERBOSE(2,"after add_lm_words current_dict->size:" << current_dict->size() << std::endl);
+
+ if (add_sentence_words){
+ for (string_vec_t::const_iterator it=word_vec.begin(); it!=word_vec.end(); ++it)
+ {
+ current_dict->encode(it->c_str());
+ }
+ }
+ current_dict->incflag(0);
+ VERBOSE(2,"after add_sentence_words current_dict->size:" << current_dict->size() << std::endl);
+
+ sent_current_dict_alternatives += current_dict->size();
+ current_dict_alternatives += current_dict->size();
+
+ VERBOSE(2,"current_dict->size:" << current_dict->size() << std::endl);
+ for (int h=0;h<current_dict->size();++h){
+ VERBOSE(2,"h:" << h << " w:|" << current_dict->decode(h) << "|" << std::endl);
+ }
+
+ //the first word in current_dict is always the current_word; hence we can skip it during the scan
+ //variables for the computation of the ranking
+ max_rank = current_dict->size(); //the current word is included in the selected alternative words
+ int current_rank = 1;
+ // for (int j=0; j<current_dict->size(); ++j){
+ for (int j=1; j<current_dict->size(); j++)
+ {
+ tmp_word_vec.at(current_pos) = current_dict->decode(j);
+ IFVERBOSE(3){
+ std::cout << "tmp_word_vec j:" << j;
+ for (string_vec_t::const_iterator it2=tmp_word_vec.begin(); it2!=tmp_word_vec.end(); ++it2) {
+ std::cout << " |" << (*it2) << "|";
+ }
+ std::cout << std::endl;
+ }
+
+ double pr;
+ if (withContext){
+ pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msidx, &msp, &statesize);
+ }else{
+ pr=lmt->clprob(tmp_word_vec, &bow, &bol, &msidx, &msp, &statesize);
+ }
+ if (pr > current_Pr){
+ ++current_rank;
+ }
+
+ VERBOSE(3," current_pos:" << current_pos << " word:|" << tmp_word_vec.at(current_pos) << "| current_Pr:" << current_Pr << " pr:" << pr << " current_rank:" << current_rank <<std::endl);
+ }
+
+ sent_tot_rank += current_rank;
+ tot_rank += current_rank;
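+ //current_rank = 1 + number of alternatives scoring strictly higher than the observed
+ //word, i.e. the 1-based rank of the observed word within the set of alternatives
+ //(rank 1 means the model prefers the observed word)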
+
+ if (debug>1){
+ //output format:
+ //word_pos:current_rank:max_rank
+ rank_outstr << " " << word_pos << ":" << current_rank << ":" << max_rank;
+ }
+ delete current_dict;
+ }
+ }
+
+ if (sent_flag) {
+ if (debug>1){
+ VERBOSE(1," sent_tot_rank:" << sent_tot_rank << " sent_Nw:" << sent_Nw << std::endl);
+ //output format: a blank-separated list of triplets
+ //current_pos:current_rank:max_rank
+ std::cout << "sent_id=" << sent_id << " ranking= " << rank_outstr.str() << std::endl;
+ }
+
+ sent_avgRank = ((double) sent_tot_rank) / sent_Nw;
+
+ std::cout << "%% sent_Nw=" << sent_Nw
+ << " sent_avgRank=" << sent_avgRank
+ << " sent_Noov=" << sent_Noov
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%"
+ << " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+ << std::endl;
+ std::cout.flush();
+
+ //reset statistics for sentence based avg Ranking
+ sent_Nw = 0;
+ sent_Noov = 0;
+ sent_tot_rank = 0;
+ ++sent_id;
+ sent_current_dict_alternatives = 0;
+ }
+ }
+
+ avgRank = ((double) tot_rank) / Nw;
+
+ std::cout << "%% Nw=" << Nw
+ << " avgRank=" << avgRank
+ << " Noov=" << Noov
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+ << " avg_alternatives=" << (float) current_dict_alternatives/Nw
+ << std::endl;
+ std::cout.flush();
+
+ if (debug>1) lmt->used_caches();
+
+ if (debug>1) lmt->stat();
+
+ delete lmt;
+ return 0;
+ }
+}
+
diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
new file mode 100644
index 0000000..0f2af82
--- /dev/null
+++ b/src/context-similarity.cpp
@@ -0,0 +1,559 @@
+// $Id: lmContextDependent.cpp 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include "ngramtable.h"
+#include "lmContainer.h"
+#include "context-similarity.h"
+#include "util.h"
+#include "mfstream.h"
+
+using namespace std;
+
+inline void error(const char* message)
+{
+ std::cerr << message << "\n";
+ throw std::runtime_error(message);
+}
+
+namespace irstlm {
+
+ ContextSimilarity::ContextSimilarity(const std::string &k_modelfile, const std::string &hk_modelfile, const std::string &hwk_modelfile)
+ {
+ m_hwk_order=3;
+ m_hk_order=2;
+ m_wk_order=m_hk_order;
+ m_k_order=1;
+ m_hwk_ngt=new ngramtable((char*) hwk_modelfile.c_str(), m_hwk_order, NULL,NULL,NULL);
+ m_hk_ngt=new ngramtable((char*) hk_modelfile.c_str(), m_hk_order, NULL,NULL,NULL);
+ m_wk_ngt=m_hk_ngt; //just a link to m_hk_ngt
+ m_k_ngt=new ngramtable((char*) k_modelfile.c_str(), m_k_order, NULL,NULL,NULL);
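+ //m_hwk_ngt holds the order-3 counts c(h,w,k); m_hk_ngt (aliased by m_wk_ngt) holds the
+ //order-2 counts c(h,k)/c(w,k); m_k_ngt holds the topic unigram counts c(k)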
+
+ m_smoothing = 0.001;
+ m_threshold_on_h = 0;
+ m_active=true;
+
+ m_topic_size = m_k_ngt->getDict()->size();
+ VERBOSE(1, "There are " << m_topic_size << " topics in the model" << std::endl);
+
+#ifdef MY_ASSERT_FLAG
+ VERBOSE(0, "MY_ASSERT is active" << std::endl);
+#else
+ VERBOSE(0, "MY_ASSERT is NOT active" << std::endl);
+#endif
+
+ }
+
+
+ ContextSimilarity::~ContextSimilarity()
+ {
+ delete m_hwk_ngt;
+ delete m_hk_ngt;
+ //delete m_wk_ngt; //it is just a link to m_hk_ngt
+ delete m_k_ngt;
+ }
+
+ void ContextSimilarity::normalize_topic_scores(topic_map_t& map)
+ {
+ UNUSED(map);
+ /* normalization type 1
+ double max = -1000000.0;
+ double min = 1000000.0;
+ for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it){
+ min = (map[it->first]<min)?map[it->first]:min;
+ max = (map[it->first]>max)?map[it->first]:max;
+ }
+ for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it){
+ map[it->first] = (map[it->first]-min)/(max-min);
+ }
+ VERBOSE(2,"min:"<<min << " max:" << max << std::endl);
+ */
+ /*
+ //normalization type 2
+ double norm = 0.0;
+ for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it){
+ norm += fabs(map[it->first]);
+ }
+ for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it){
+ map[it->first] = map[it->first]/norm;
+ }
+ VERBOSE(2,"norm:" << norm << std::endl);
+ */
+ }
+
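+ //DeltaCrossEntropy returns (1/len) * sum_k topic_map[k]*tmp_map[k], i.e. a
+ //length-normalized dot product between the reference and the current topic distributions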
+ double ContextSimilarity::DeltaCrossEntropy(topic_map_t& topic_map, topic_map_t& tmp_map, double len)
+ {
+ double xDeltaEntropy = 0.0;
+ for (topic_map_t::iterator it=tmp_map.begin(); it != tmp_map.end(); ++it){
+ xDeltaEntropy += topic_map[it->first] * tmp_map[it->first];
+ // VERBOSE(2,"topic_map[it->first]:" << topic_map[it->first] << " tmp_map[it->first]:" << tmp_map[it->first] << " product:" << topic_map[it->first] * tmp_map[it->first] << std::endl);
+ }
+ // VERBOSE(2," xDeltaEntropy:" << xDeltaEntropy << " len:" << len << " xDeltaEntropy/len:" << xDeltaEntropy/len << std::endl);
+ return xDeltaEntropy/len;
+ }
+
+ void ContextSimilarity::add_topic_scores(topic_map_t& topic_map, topic_map_t& tmp_map)
+ {
+ for (topic_map_t::iterator it=tmp_map.begin(); it != tmp_map.end(); ++it){
+ topic_map[it->first] += tmp_map[it->first];
+ }
+ }
+
+ //returns the scores for all topics in the topic models (without apriori topic prob)
+ void ContextSimilarity::print_topic_scores(topic_map_t& map)
+ {
+ for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it)
+ {
+ if (it!=map.begin()) { std::cout << topic_map_delimiter1; }
+ std::cout << it->first << topic_map_delimiter2 << it->second;
+ // std::cout << it->first << topic_map_delimiter2 << exp(it->second * M_LN10);
+ }
+
+ std::cout << std::endl;
+ }
+
+ void ContextSimilarity::print_topic_scores(topic_map_t& map, topic_map_t& refmap, double len)
+ {
+ for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it)
+ {
+ if (it!=map.begin()) { std::cout << topic_map_delimiter1; }
+ std::cout << it->first << topic_map_delimiter2 << it->second;
+ }
+ std::cout << " DeltaCrossEntropy:" << DeltaCrossEntropy(refmap,map,len);
+ std::cout << std::endl;
+ }
+
+ void ContextSimilarity::create_ngram(const string_vec_t& text, ngram& ng)
+ {
+ //text is a vector of strings with w in the last position and the history in the previous positions
+ //text must have at least one word
+ //if text has two words, further computation will rely on normal counts, i.e. counts(h,w,k), counts(h,w), counts(w,k), counts(h,k), counts(k)
+ //if text has only one word, further computation will rely on lower-order counts, i.e. counts(w,k), counts(w), counts(k), counts()
+ VERBOSE(2,"void ContextSimilarity::create_ngram" << std::endl);
+ VERBOSE(2,"text.size:" << text.size() << std::endl);
+
+ MY_ASSERT(text.size()>0);
+
+ if (text.size()==1){
+ //all further computation will rely on lower-order counts
+ ng.pushw(text.at(text.size()-1));
+ }else {
+ ng.pushw(text.at(text.size()-2));
+ ng.pushw(text.at(text.size()-1));
+ }
+ VERBOSE(2,"output of create_ngram ng:|" << ng << "| ng.size:" << ng.size << std::endl);
+ }
+
+ void ContextSimilarity::create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& ng)
+ {
+ //text is a vector of strings with w in the last position and the history in the previous positions
+ //text must have at least one word
+ //topic is added in the most recent position of the ngram
+ create_ngram(text, ng);
+ add_topic(topic, ng);
+ VERBOSE(2,"output of create_topic_ngram ng:|" << ng << "| ng.size:" << ng.size << std::endl);
+ }
+
+ void ContextSimilarity::add_topic(const std::string& topic, ngram& ng)
+ {
+ ng.pushw(topic);
+ }
+
+ void ContextSimilarity::modify_topic(const std::string& topic, ngram& ng)
+ {
+ *ng.wordp(1) = ng.dict->encode(topic.c_str());
+ }
+
+ void ContextSimilarity::get_counts(ngram& ng, ngramtable& ngt, double& c_xk, double& c_x)
+ {
+ VERBOSE(2, "double ContextSimilarity::get_counts(ngram& ng, double& c_xk, double& c_x) with ng:|" << ng << "|" << std::endl);
+ //counts taken from the tables are modified to avoid zero values for the probs
+ //a constant epsilon (smoothing) is added
+ //we also assume that c(x) = sum_k c(xk)
+
+ //we assume that ng ends with a correct topic
+ //we assume that ng is compliant with ngt, and has the correct size
+
+ c_xk = m_smoothing;
+ c_x = m_smoothing * m_topic_size;
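+ //i.e. c'(x,k) = c(x,k) + eps and c'(x) = c(x) + eps*K, with eps = m_smoothing and
+ //K = m_topic_size, so that c'(x) = sum_k c'(x,k) still holds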
+
+ if (ngt.get(ng)) { c_xk += ng.freq; }
+ if (ngt.get(ng,ng.size,ng.size-1)) { c_x += ng.freq; }
+
+ VERBOSE(3, "c_xk:" << c_xk << " c_x:" << c_x << std::endl);
+ }
+
+ double ContextSimilarity::topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2)
+ {
+ ngram ng(ngt.getDict());
+
+ create_topic_ngram(text, topic, ng);
+
+ return topic_score(ng, ngt, ngt2);
+ }
+
+ double ContextSimilarity::topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2){
+#ifdef OPTION_1
+ return topic_score_option1(ng, ngt, ngt2);
+#elif OPTION_2
+ return topic_score_option2(ng, ngt, ngt2);
+#elif OPTION_3
+ return topic_score_option3(ng, ngt, ngt2);
+#else
+ return topic_score_option0(ng, ngt, ngt2);
+#endif
+ }
+
+ double ContextSimilarity::topic_score_option0(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
+ {
+ UNUSED(ngt);
+ UNUSED(ngt2);
+ VERBOSE(2, "double ContextSimilarity::topic_score_option0(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
+
+ //option 0: uniform (not considering log function)
+ //P(k|hw) = 1/number_of_topics
+ double log_pr = -log(m_topic_size)/M_LN10;
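+ //(-log(K)/M_LN10 equals log10(1/K), i.e. the uniform probability over the K topics in log10)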
+
+ VERBOSE(3, "option0: return: " << log_pr<< std::endl);
+ return log_pr;
+ }
+
+ double ContextSimilarity::topic_score_option1(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
+ {
+ VERBOSE(2, "double ContextSimilarity::topic_score_option1(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
+
+ //ngt provides counts c(hwk) and c(hw) (or c(wk) and c(w))
+ double c_xk, c_x;
+ get_counts(ng, ngt, c_xk, c_x);
+
+ //copy and transform codes
+ // shift all terms, but the topic
+ // ng2[3]=ng[4];
+ // ng2[2]=ng[3];
+ // ng2[1]=ng[1];
+ ngram ng2(ngt2.getDict());
+ ng2.trans(ng);
+ ng2.shift();
+ *ng2.wordp(1)=ng2.dict->encode(ng.dict->decode(*ng.wordp(1)));
+
+ //ngt2 provides counts c(hk) and c(h) (or c(k) and c())
+ double c_xk2, c_x2;
+ get_counts(ng2, ngt2, c_xk2, c_x2);
+
+ //option 1: (not considering log function)
+ //P(k|hw)/sum_v P(k|hv) ~approx~ P(k|hw)/P(k|h) ~approx~ num_pr/den_pr
+ //num_pr = c'(hwk)/c'(hw)
+ //den_pr = c'(hk)/c'(h)
+ double den_log_pr = log10(c_xk2) - log10(c_x2);
+ double num_log_pr = log10(c_xk) - log10(c_x);
+ double log_pr = num_log_pr - den_log_pr;
+ VERBOSE(3, "option1: num_log_pr:" << num_log_pr << " den_log_pr:" << den_log_pr << " return: " << log_pr << std::endl);
+ return log_pr;
+ }
+
+ double ContextSimilarity::topic_score_option2(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
+ {
+ UNUSED(ngt2);
+ VERBOSE(2, "double ContextSimilarity::topic_score_option2(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
+
+ //ngt provides counts c(hwk) and c(hw) (or c(wk) and c(w))
+ double c_xk, c_x;
+ get_counts(ng, ngt, c_xk, c_x);
+
+ //option 2: (not considering log function)
+ //P(k|hw)/sum_v P(k|hv) ~approx~ P(k|hw)/P(k|h) ~approx~ c'(hwk)/c'(hw)
+ double log_pr = log10(c_xk) - log10(c_x);
+ VERBOSE(3, "option2: log_pr:" << log_pr << " return: " << log_pr << std::endl);
+ return log_pr;
+ }
+
+ double ContextSimilarity::topic_score_option3(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
+ {
+ VERBOSE(2, "double ContextSimilarity::topic_score_option3(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
+
+ //ngt provides counts c(hwk) and c(hw) (or c(wk) and c(w))
+ double c_xk, c_x;
+ get_counts(ng, ngt, c_xk, c_x);
+
+ //copy and transform codes
+ // shift all terms, but the topic
+ // ng2[3]=ng[4];
+ // ng2[2]=ng[3];
+ // ng2[1]=ng[1];
+ ngram ng2(ngt2.getDict());
+ ng2.trans(ng);
+ ng2.shift();
+ *ng2.wordp(1)=ng2.dict->encode(ng.dict->decode(*ng.wordp(1)));
+
+ //ngt2 provides counts c(hk) and c(h) (or c(k) and c())
+ double c_xk2, c_x2;
+ get_counts(ng2, ngt2, c_xk2, c_x2);
+
+ /*
+ //approximation 3: (not considering log function)
+ //P(k|hw)/sum_v P(k|hv) ~approx~ logistic_function(P(k|hw)/P(k|h))
+ // ~approx~ logistic_function(num_pr/den_pr)
+ // ~approx~ logistic_function(c'(hwk)/c'(hw)/c'(hk)/c'(h))
+ // ~approx~ logistic_function((c'(hwk)*c'(h))/(c'(hw)*c'(hk)))
+
+ return logistic_function((c'(hwk)*c'(h))/(c'(hw)*c'(hk)),1.0,1.0)
+ */
+
+ double log_pr = logistic_function((c_xk*c_x2)/(c_x*c_xk2),1.0,1.0);
+
+ VERBOSE(3, "option3: return: " << log_pr << std::endl);
+ return log_pr;
+ }
+
+ double ContextSimilarity::total_topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2, dictionary& dict)
+ {
+ ngram ng(ngt.getDict());
+ create_topic_ngram(text, topic, ng);
+ return total_topic_score(ng, ngt, ngt2, dict);
+ }
+
+ double ContextSimilarity::total_topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict)
+ {
+ double tot_pr = 0.0;
+ double v_topic_pr;
+ for (int v=0; v<dict.size(); ++v){
+ //replace last word, which is in position 2, keeping topic in position 1 unchanged
+ *ng.wordp(2) = ng.dict->encode(dict.decode(v));
+ v_topic_pr = topic_score(ng, ngt, ngt2);
+ tot_pr += pow(10.0,v_topic_pr); //v_topic_pr is a log10 prob
+ }
+ return log10(tot_pr);
+ }
+
+ double ContextSimilarity::total_topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight)
+ {
+ ngram ng(ngt.getDict());
+ create_topic_ngram(text, topic, ng);
+ return total_topic_score(ng, ngt, ngt2, dict, lm, weight);
+ }
+
+ double ContextSimilarity::total_topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight)
+ {
+ double tot_pr = 0.0;
+ double v_pr, v_topic_pr, v_lm_pr;
+ for (int v=0; v<dict.size(); ++v){
+ //replace last word, which is in position 2, keeping topic in position 1 unchanged
+ *ng.wordp(2) = ng.dict->encode(dict.decode(v));
+ v_topic_pr = topic_score(ng, ngt, ngt2);
+ v_lm_pr = lm.clprob(ng);
+ v_pr = v_lm_pr + weight * v_topic_pr;
+ tot_pr += pow(10.0,v_pr); //v_pr is a log10 prob
+ }
+ return log10(tot_pr);
+ }
+
+ void ContextSimilarity::modify_context_map(string_vec_t& text, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, topic_map_t& topic_weights, topic_map_t& mod_topic_weights)
+ {
+ ngram ng(ngt.getDict());
+ create_topic_ngram(text, "dummy_topic", ng); //just for initialization
+
+ modify_context_map(ng, ngt, ngt2, dict, topic_weights, mod_topic_weights);
+ }
+
+ void ContextSimilarity::modify_context_map(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, topic_map_t& topic_weights, topic_map_t& mod_topic_weights)
+ {
+ double global_score;
+ double mod_topic_pr;
+ for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+ {
+ modify_topic(it->first, ng);
+ global_score = total_topic_score(ng, ngt, ngt2, dict);
+ global_score = pow(10.0,global_score);
+ mod_topic_pr = it->second/global_score;
+ mod_topic_weights.insert(make_pair(it->first,mod_topic_pr));
+ }
+ }
+
+ void ContextSimilarity::modify_context_map(string_vec_t& text, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight, topic_map_t& topic_weights, topic_map_t& mod_topic_weights)
+ {
+ ngram ng(ngt.getDict());
+ create_topic_ngram(text, "dummy_topic", ng); //just for initialization
+
+ modify_context_map(ng, ngt, ngt2, dict, lm, weight, topic_weights, mod_topic_weights);
+ }
+
+ void ContextSimilarity::modify_context_map(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight, topic_map_t& topic_weights, topic_map_t& mod_topic_weights)
+ {
+ double global_score;
+ double mod_topic_pr;
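+ //same as the unweighted variant, except that the normalization term also includes the LM probabilities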
+ for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+ {
+ modify_topic(it->first, ng);
+ global_score = total_topic_score(ng, ngt, ngt2, dict, lm, weight);
+ global_score = pow(10.0,global_score);
+ mod_topic_pr = it->second/global_score;
+ mod_topic_weights.insert(make_pair(it->first,mod_topic_pr));
+ }
+ }
+
+
+ double ContextSimilarity::context_similarity(string_vec_t& text, topic_map_t& topic_weights)
+ {
+#ifdef SOLUTION_1
+ return context_similarity_solution1(text, topic_weights);
+#elif defined(SOLUTION_2)
+ return context_similarity_solution2(text, topic_weights);
+#else
+ UNUSED(text);
+ UNUSED(topic_weights);
+ VERBOSE(3, "This solution type is not defined; forced to default solution 1" << std::endl);
+ return context_similarity_solution1(text, topic_weights);
+// exit(IRSTLM_CMD_ERROR_GENERIC);
+#endif
+ }
+
+ //return the log10 of the similarity score
+ double ContextSimilarity::context_similarity_solution1(string_vec_t& text, topic_map_t& topic_weights)
+ {
+ VERBOSE(2, "double ContextSimilarity::context_similarity_solution1(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
+ double ret_log10_pr = 0.0;
+
+ if (!m_active){
+ //the similarity score is disabled
+ //return an uninformative score (log10(1.0) = 0.0)
+ ret_log10_pr = 0.0;
+ }
+ else if (topic_weights.size() == 0){
+ //the a-priori topic distribution is empty, i.e. there is no score for any topic
+ //return an uninformative score (log10(1.0) = 0.0)
+ ret_log10_pr = 0.0;
+ }
+ else{
+ VERBOSE(3, "topic_weights.size():" << topic_weights.size() << std::endl);
+
+ ngramtable* current_ngt;
+ ngramtable* current_ngt2;
+
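+ //with a single word, use the (w,topic)/(topic) tables; with a longer context, use the (h,w,topic)/(h,topic) tables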
+ if (text.size()==1){
+ current_ngt = m_wk_ngt;
+ current_ngt2 = m_k_ngt;
+ }
+ else{
+ current_ngt = m_hwk_ngt;
+ current_ngt2 = m_hk_ngt;
+ }
+
+ ngram ng(current_ngt->getDict());
+ create_topic_ngram(text, "dummy_topic", ng); //just for initialization
+
+
+ if (reliable(ng, current_ngt)){
+ //this word sequence is reliable
+
+ double ret_pr = 0.0;
+ for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+ {
+ ngram current_ng = ng;
+ modify_topic(it->first, current_ng);
+
+ double apriori_topic_score = it->second; //prob
+ double current_topic_score = exp(topic_score(current_ng, *current_ngt, *current_ngt2) * M_LN10); //topic_score(...) returns a log10; hence exp is applied to (score * M_LN10)
+
+ VERBOSE(3, "current_ng:|" << current_ng << "| topic:|" << it->first << "| apriori_topic_score:" << apriori_topic_score << " topic_score:" << current_topic_score << " score_toadd:" << ret_pr << std::endl);
+ ret_pr += apriori_topic_score * current_topic_score;
+ VERBOSE(3, "CURRENT ret_pr:" << ret_pr << std::endl);
+ }
+ ret_log10_pr = log10(ret_pr);
+ }
+ else{
+ //this word sequence is not reliable enough, because ng occurs too few times
+ //return an uninformative score (log10(1/K) = -log10(K))
+ ret_log10_pr = -log(m_topic_size)/M_LN10;
+// ret_log10_pr = 0.0;
+ VERBOSE(3, "CURRENT ret_pr:" << pow(10.0,ret_log10_pr) << std::endl);
+ }
+
+ }
+ VERBOSE(2, "ret_log10_pr:" << ret_log10_pr << std::endl);
+ return ret_log10_pr;
+ }
+
+ //return the log10 of the similarity score
+ double ContextSimilarity::context_similarity_solution2(string_vec_t& text, topic_map_t& topic_weights)
+ {
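+ //solution 2 is currently a placeholder and simply falls back to solution 1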
+ return context_similarity_solution1(text, topic_weights);
+ }
+
+ bool ContextSimilarity::reliable(ngram& ng, ngramtable* ngt)
+ {
+ VERBOSE(2, "ContextSimilarity::reliable(ngram& ng, ngramtable* ngt) ng:|" << ng << "| ng.size:" << ng.size<< "| thr:" << m_threshold_on_h << std::endl);
+
+ bool ret=false;
+
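+ //the sequence is reliable only if the lookup at depth ng.size-1 succeeds (presumably the history count, per the name m_threshold_on_h) and its frequency exceeds the threshold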
+ if (ngt->get(ng,ng.size,ng.size-1) && (ng.freq > m_threshold_on_h)){
+ ret=true;
+ }else{
+ ret=false;
+ }
+ VERBOSE(3, "ng:|" << ng << "| thr:" << m_threshold_on_h << " reliable:" << ret << std::endl);
+ return ret;
+ }
+
+
+ //returns the scores for all topics in the topic models (without apriori topic prob)
+ void ContextSimilarity::get_topic_scores(string_vec_t& text, topic_map_t& topic_map)
+ {
+ if (m_active){ //compute only if the similarity score is enabled
+ ngramtable* current_ngt;
+ ngramtable* current_ngt2;
+
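+ //select the unigram- or trigram-based topic tables, as in context_similarity_solution1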
+ if (text.size()==1){
+ current_ngt = m_wk_ngt;
+ current_ngt2 = m_k_ngt;
+ }
+ else{
+ current_ngt = m_hwk_ngt;
+ current_ngt2 = m_hk_ngt;
+ }
+
+ ngram ng(current_ngt->getDict());
+ create_topic_ngram(text, "dummy_topic", ng); //just for initialization
+
+ get_topic_scores(ng, *current_ngt, *current_ngt2, topic_map);
+ }
+ }
+
+
+ //returns the scores for all topics in the topic models (without apriori topic prob)
+ void ContextSimilarity::get_topic_scores(ngram& ng, ngramtable& ngt, ngramtable& ngt2, topic_map_t& topic_map)
+ {
+ if (m_active){ //compute only if the similarity score is enabled
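+ //loop over every topic in the topic dictionary, plug it into the topic slot of ng and record its (non-log) score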
+ for (int i=0; i<m_k_ngt->getDict()->size();++i)
+ {
+ std::string _topic = m_k_ngt->getDict()->decode(i);
+ modify_topic(_topic, ng);
+ topic_map[_topic] = pow(10.0,topic_score(ng, ngt, ngt2));
+ }
+ }
+ }
+
+}//namespace irstlm
diff --git a/src/context-similarity.h b/src/context-similarity.h
new file mode 100644
index 0000000..5e67553
--- /dev/null
+++ b/src/context-similarity.h
@@ -0,0 +1,138 @@
+// $Id: lmContextDependent.h 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
+
+#ifndef MF_CONTEXTSIMILARITY_H
+#define MF_CONTEXTSIMILARITY_H
+
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <string>
+#include <math.h>
+#include "cmd.h"
+#include "util.h"
+#include "dictionary.h"
+#include "n_gram.h"
+#include "ngramtable.h"
+#include "lmContainer.h"
+
+class ngram;
+
+namespace irstlm {
+#define topic_map_delimiter1 ':'
+#define topic_map_delimiter2 ','
+#define SIMILARITY_LOWER_BOUND -10000
+
+ class ContextSimilarity
+ {
+ private:
+ ngramtable* m_hwk_ngt; // counts(h, w, topic)
+ ngramtable* m_hk_ngt; // counts(h, topic)
+ ngramtable* m_wk_ngt; // counts(w, topic)
+ ngramtable* m_k_ngt; // counts(topic)
+ int m_k_order; //order of m_k_ngt
+ int m_hk_order; //order of m_hk_ngt
+ int m_wk_order; //order of m_wk_ngt
+ int m_hwk_order; //order of m_hwk_ngt
+
+ int m_topic_size; //number of topics in the model
+
+ topic_map_t topic_map;
+ int m_threshold_on_h; //frequency threshold on h to allow computation of similarity scores
+ double m_smoothing; //smoothing value added to the counts to avoid zero probabilities; implements a sort of shift-beta smoothing
+
+ //flag for enabling/disabling context_similarity scores
+ // if disabled, context_similarity is 0.0 and topic_scores distribution is empty
+ bool m_active;
+
+ void create_ngram(const string_vec_t& text, ngram& ng);
+ void add_topic(const std::string& topic, ngram& ng);
+ void modify_topic(const std::string& topic, ngram& ng);
+ void create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& ng);
+
+ void get_counts(ngram& ng, ngramtable& ngt, double& c_xk, double& c_x);
+
+ double topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2);
+ double topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+ double topic_score_option0(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+ double topic_score_option1(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+ double topic_score_option2(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+ double topic_score_option3(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+
+ double total_topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2, dictionary& dict);
+ double total_topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight);
+ double total_topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict);
+ double total_topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight);
+
+
+
+ void modify_context_map(string_vec_t& text, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, topic_map_t& topic_weights, topic_map_t& mod_topic_weights);
+ void modify_context_map(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, topic_map_t& topic_weights, topic_map_t& mod_topic_weights);
+ void modify_context_map(string_vec_t& text, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight, topic_map_t& topic_weights, topic_map_t& mod_topic_weights);
+ void modify_context_map(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight, topic_map_t& topic_weights, topic_map_t& mod_topic_weights);
+
+ double context_similarity_solution1(string_vec_t& text, topic_map_t& topic_weights);
+ double context_similarity_solution2(string_vec_t& text, topic_map_t& topic_weights);
+
+ bool reliable(ngram& ng, ngramtable* ngt);
+
+ public:
+ ContextSimilarity(const std::string &dictfile, const std::string &num_modelfile, const std::string &den_modelfile);
+ ~ContextSimilarity();
+
+ void get_topic_scores(string_vec_t& text, topic_map_t& topic_map);
+ void get_topic_scores(ngram& ng, ngramtable& ngt, ngramtable& ngt2, topic_map_t& topic_map);
+
+ void add_topic_scores(topic_map_t& map, topic_map_t& tmp_map);
+ void print_topic_scores(topic_map_t& map);
+ void print_topic_scores(topic_map_t& map, topic_map_t& refmap, double len);
+ double DeltaCrossEntropy(topic_map_t& topic_map, topic_map_t& tmp_map, double len);
+
+ void normalize_topic_scores(topic_map_t& map);
+
+ double context_similarity(string_vec_t& text, topic_map_t& topic_weights);
+
+ int get_Threshold_on_H(){
+ return m_threshold_on_h;
+ }
+ void set_Threshold_on_H(int val){
+ m_threshold_on_h = val;
+ }
+ double get_SmoothingValue(){
+ return m_smoothing;
+ }
+ void set_SmoothingValue(double val){
+ m_smoothing = val;
+ }
+ bool is_Active(){
+ return m_active;
+ }
+ void set_Active(bool val){
+ m_active = val;
+ }
+
+ };
+}
+
+
+#endif
+
diff --git a/src/lmContainer.cpp b/src/lmContainer.cpp
index afdd77c..654a064 100644
--- a/src/lmContainer.cpp
+++ b/src/lmContainer.cpp
@@ -32,6 +32,7 @@
#include "lmmacro.h"
#include "lmclass.h"
#include "lmInterpolation.h"
+#include "lmContextDependent.h"
using namespace std;
@@ -94,6 +95,8 @@ namespace irstlm {
VERBOSE(1,"type: " << type << std::endl);
if (header == "lmminterpolation" || header == "LMINTERPOLATION") {
type = _IRSTLM_LMINTERPOLATION;
+ } else if (header == "lmcontextdependent" || header == "LMCONTEXTDEPENDENT") {
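+ // files whose header line is "lmcontextdependent" or "LMCONTEXTDEPENDENT" select the new context-dependent LM type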
+ type = _IRSTLM_LMCONTEXTDEPENDENT;
} else if (header == "lmmacro" || header == "LMMACRO") {
type = _IRSTLM_LMMACRO;
} else if (header == "lmclass" || header == "LMCLASS") {
@@ -142,6 +145,11 @@ namespace irstlm {
VERBOSE(1,"_IRSTLM_LMINTERPOLATION" << std::endl);
lm = new lmInterpolation(nlf, dlf);
break;
+
+ case _IRSTLM_LMCONTEXTDEPENDENT:
+ VERBOSE(1,"_IRSTLM_LMCONTEXTDEPENDENT" << std::endl);
+ lm = new lmContextDependent(nlf, dlf);
+ break;
default:
VERBOSE(1,"UNKNOWN" << std::endl);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git