[irstlm] 52/126: code improvement

Giulio Paci giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:44 UTC 2016


This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.

commit dad103563fbe15106dfb8e7fb94dcfa2612ba4cb
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Wed Aug 19 18:15:55 2015 +0200

    code improvement
---
 src/context-similarity.cpp | 99 ++++++++++++++++++++++++++--------------------
 src/context-similarity.h   |  4 +-
 2 files changed, 59 insertions(+), 44 deletions(-)

diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
index a474d72..62623df 100644
--- a/src/context-similarity.cpp
+++ b/src/context-similarity.cpp
@@ -56,6 +56,13 @@ namespace irstlm {
 		
 		m_topic_size = m_k_ngt->getDict()->size();
 		VERBOSE(1, "There are " << m_topic_size << " topics in the model" << std::endl);
+		
+#ifdef MY_ASSERT_FLAG
+		VERBOSE(0, "MY_ASSERT is active" << std::endl);
+#else
+		VERBOSE(0, "MY_ASSERT is NOT active" << std::endl);
+#endif
+		
 	}
 
 	
@@ -84,7 +91,7 @@ namespace irstlm {
 			
 			
 			create_ngram(text, base_num_ng, base_den_ng);
-			if (den_reliable(base_den_ng)){ //we do not know about the reliability of the denominator
+			if (base_reliable(base_den_ng, 2, m_hk_ngt)){ //we do not know about the reliability of the denominator
 				
 				for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
 				{
@@ -180,8 +187,9 @@ namespace irstlm {
 		//if text has two word, further computation will rely on normal counts, i.e. counts(h,w,k), counts(h,w), counts(h,k), counts(k)
 		//if text has only one word, further computation will rely on lower-order counts, i.e. (w,k), counts(w), counts(k), counts()
 		VERBOSE(2,"void ContextSimilarity::create_ngram" << std::endl);
+		VERBOSE(2,"text.size:" << text.size() << std::endl);
 
-		MY_ASSERT(text.size()==0);
+		MY_ASSERT(text.size()>0);
 		
 		if (text.size()==1){
 			//all further computation will rely on lower-order counts
@@ -228,76 +236,83 @@ namespace irstlm {
 		
 		if (den_ng.size == m_hk_order){//we rely on counts(h,k) and counts(h)
 			if (m_hk_ngt->get(den_ng)) {	c_hk += den_ng.freq; }
-			if (m_hk_ngt->get(den_ng,2,1)) { c_h += den_ng.freq; }
+			if (m_hk_ngt->get(den_ng,den_ng.size,den_ng.size-1)) { c_h += den_ng.freq; }
 		}else{//we actually rely on counts(k) and counts()
-			/*
-			 if (m_k_ngt->get(den_ng)) {	c_hk += den_ng.freq; }
-			 c_h += m_hk_ngt->getDict()->totfreq();
-			 */
 			c_hk += m_hk_ngt->getDict()->freq(*(den_ng.wordp(1)));
 			c_h += m_k_ngt->getDict()->totfreq();
 		}
 		den_log_pr = log10(c_hk) - log10(c_h);
 		VERBOSE(3, "c_hk:" << c_hk << " c_h:" << c_h << std::endl);
 		
-		
-		if (num_reliable(num_ng)){
-			if (num_ng.size == m_hwk_order){ //we rely on counts(h,w,k) and counts(h,w)
+		if (num_ng.size == m_hwk_order){ //we rely on counts(h,w,k) and counts(h,w)
+			if (reliable(num_ng, num_ng.size, m_hwk_ngt)){
 				if (m_hwk_ngt->get(num_ng)) {	c_hwk += num_ng.freq; }
-				if (m_hwk_ngt->get(num_ng,3,2)) { c_hw += num_ng.freq; }
-			}else{ //we actually rely on counts(h,k) and counts(h)
+				if (m_hwk_ngt->get(num_ng,num_ng.size,num_ng.size-1)) { c_hw += num_ng.freq; }
+			}else{
+				c_hwk=1;
+				c_hk=m_topic_size;
+			}
+		}else{ //hence num_ng.size=m_hwk_order-1, we actually rely on counts(h,k) and counts(h)
+			if (reliable(num_ng, num_ng.size, m_hk_ngt)){
 				if (m_hk_ngt->get(num_ng)) {	c_hwk += num_ng.freq; }
-				if (m_hk_ngt->get(num_ng,3,2)) { c_hw += num_ng.freq; }
+				if (m_hk_ngt->get(num_ng,num_ng.size,num_ng.size-1)) { c_hw += num_ng.freq; }
+			}else{
+				c_hwk=1;
+				c_hk=m_topic_size;
 			}
-			num_log_pr = log10(c_hwk) - log10(c_hw);
-			VERBOSE(3, "c_hwk:" << c_hwk << " c_hw:" << c_hw << std::endl);
-		}else{
-			num_log_pr = -log10(m_topic_size);
 		}
+		VERBOSE(3, "c_hwk:" << c_hwk << " c_hw:" << c_hw << std::endl);
+		num_log_pr = log10(c_hwk) - log10(c_hw);
 		
 		VERBOSE(3, "num_log_pr:" << num_log_pr << " den_log_pr:" << den_log_pr << std::endl);
 		return num_log_pr - den_log_pr;
 	}
 	
-	bool ContextSimilarity::num_reliable(ngram& num_ng)
+	bool ContextSimilarity::reliable(ngram& ng, int size, ngramtable* ngt)
 	{
-		VERBOSE(2, "ContextSimilarity::num_reliable(ngram& num_ng) num_ng:|" << num_ng << "| thr:" << m_threshold_on_h << "|" << std::endl);		
-		if (num_ng.size < 2){
+		VERBOSE(2, "ContextSimilarity::reliable(ngram& ng, int size, ngramtable* ngt) ng:|" << ng << "| thr:" << m_threshold_on_h << "| ng.size:" << ng.size << " size:" << size << std::endl);	
+		
+		bool ret=false;
+		
+		if (ng.size < size){
 			//num_ng has size lower than expected (2)
 			//in this case we will rely on counts(h, topic) instead of counts(h, w, topic)
-			VERBOSE(3, "num_ng:|" << num_ng << "| has size lower than expected (2) TRUE" << std::endl);
-			return true;
+			VERBOSE(3, "ng:|" << ng << "| has size (" << ng.size<< " ) lower than expected (" << size << ")" << std::endl);
+			ret=true;
 		}
-		if (m_hwk_ngt->get(num_ng,3,2) && (num_ng.freq > m_threshold_on_h)){
-			VERBOSE(3, "num_ng:|" << num_ng << "| thr:" << m_threshold_on_h << " TRUE" << std::endl);
-			return true;
+		
+		if (ngt->get(ng,size,size-1) && (ng.freq > m_threshold_on_h)){
+			ret=true;
 		}else{
-			VERBOSE(3, "num_ng:|" << num_ng << "| thr:" << m_threshold_on_h << " FALSE" << std::endl);
-			return false;
+			ret=false;
 		}
+		VERBOSE(3, "ng:|" << ng << "| thr:" << m_threshold_on_h << " reliable:" << ret << std::endl);
+		return ret;
 	}
 	
-	
-	bool ContextSimilarity::den_reliable(ngram& den_ng)
+	bool ContextSimilarity::base_reliable(ngram& ng, int size, ngramtable* ngt)
 	{
-		VERBOSE(2, "ContextSimilarity::den_reliable(ngram& den_ng) den_ng:|" << den_ng << "| thr:" << m_threshold_on_h << "|" << std::endl);
+		VERBOSE(2, "ContextSimilarity::base_reliable(ngram& ng, int size, ngramtable* ngt) ng:|" << ng << "| thr:" << m_threshold_on_h << "|" << std::endl);
 
-		if (den_ng.size < 1){
+		bool ret=false;
+		
+		if (ng.size < size){
 			//den_ng has size lower than expected (1)
 			//in this case we will rely on counts(topic) instead of counts(h, topic)
-			VERBOSE(3, "den_ng:|" << den_ng << "| has size lower than expected (1) TRUE" << std::endl);
-			return true;
+			VERBOSE(3, "ng:|" << ng << "| has size (" << ng.size<< " ) lower than expected (" << size << ")" << std::endl);
+			ret=true;
 		}
-		den_ng.pushc(0);
-		if (m_hk_ngt->get(den_ng,2,1) && (den_ng.freq > m_threshold_on_h)){
-			den_ng.shift();
-			VERBOSE(3, "den_ng:|" << den_ng << "| thr:" << m_threshold_on_h << " TRUE" << std::endl);
-			return true;
-		}else{
-			den_ng.shift();
-			VERBOSE(3, "den_ng:|" << den_ng << "| thr:" << m_threshold_on_h << " FALSE" << std::endl);
-			return false;
+		else{
+			ng.pushc(0);
+			if (m_hk_ngt->get(ng,size,size-1) && (ng.freq > m_threshold_on_h)){
+				ret=true;
+			}else{
+				ret=false;
+			}
+			ng.shift();
 		}
+		VERBOSE(3, "ng:|" << ng << "| thr:" << m_threshold_on_h << " reliable:" << ret << std::endl);
+		return ret;
 	}
 	
 }//namespace irstlm
diff --git a/src/context-similarity.h b/src/context-similarity.h
index 32324c6..d26ec84 100644
--- a/src/context-similarity.h
+++ b/src/context-similarity.h
@@ -71,8 +71,8 @@ namespace irstlm {
 		double get_topic_similarity(string_vec_t text, const std::string& topic);
 		double get_topic_similarity(ngram& num_ng, ngram& den_ng);
 		
-		bool num_reliable(ngram& num_ng);
-		bool den_reliable(ngram& den_ng);
+		bool reliable(ngram& ng, int size, ngramtable* ngt);
+		bool base_reliable(ngram& ng, int size, ngramtable* ngt);
 		
 	public:
 		ContextSimilarity(const std::string &dictfile, const std::string &num_modelfile, const std::string &den_modelfile);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git



More information about the debian-science-commits mailing list