[DRE-commits] [ruby-classifier-reborn] 06/08: Import test from upstream
Youhei SASAKI
uwabami-guest at moszumanska.debian.org
Thu Aug 31 01:34:51 UTC 2017
This is an automated email from the git hooks/post-receive script.
uwabami-guest pushed a commit to branch master
in repository ruby-classifier-reborn.
commit 8c8baa1df0209a0d98ebd7a741862b850977db89
Author: Youhei SASAKI <uwabami at gfd-dennou.org>
Date: Thu Aug 31 10:31:27 2017 +0900
Import test from upstream
Signed-off-by: Youhei SASAKI <uwabami at gfd-dennou.org>
---
.../patches/0002-Import-test-from-upstream.patch | 497 +++++++++++++++++++++
debian/patches/series | 1 +
2 files changed, 498 insertions(+)
diff --git a/debian/patches/0002-Import-test-from-upstream.patch b/debian/patches/0002-Import-test-from-upstream.patch
new file mode 100644
index 0000000..2a3c29c
--- /dev/null
+++ b/debian/patches/0002-Import-test-from-upstream.patch
@@ -0,0 +1,497 @@
+From: Youhei SASAKI <uwabami at gfd-dennou.org>
+Date: Thu, 31 Aug 2017 10:26:59 +0900
+Subject: Import test from upstream
+
+Signed-off-by: Youhei SASAKI <uwabami at gfd-dennou.org>
+---
+ test/bayes/bayesian_test.rb | 125 +++++++++++++++++++++++++
+ test/data/stopwords/en | 4 +
+ test/extensions/hasher_test.rb | 67 ++++++++++++++
+ test/lsi/lsi_test.rb | 203 +++++++++++++++++++++++++++++++++++++++++
+ test/lsi/word_list_test.rb | 33 +++++++
+ test/test_helper.rb | 8 ++
+ 6 files changed, 440 insertions(+)
+ create mode 100755 test/bayes/bayesian_test.rb
+ create mode 100644 test/data/stopwords/en
+ create mode 100644 test/extensions/hasher_test.rb
+ create mode 100644 test/lsi/lsi_test.rb
+ create mode 100644 test/lsi/word_list_test.rb
+ create mode 100644 test/test_helper.rb
+
+diff --git a/test/bayes/bayesian_test.rb b/test/bayes/bayesian_test.rb
+new file mode 100755
+index 0000000..f2f355b
+--- /dev/null
++++ b/test/bayes/bayesian_test.rb
+@@ -0,0 +1,125 @@
++# encoding: utf-8
++
++require File.dirname(__FILE__) + '/../test_helper'
++class BayesianTest < Minitest::Test
++ def setup
++ @classifier = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
++ end
++
++ def test_good_training
++ assert_equal ['love'], @classifier.train_interesting('love')
++ end
++
++ def test_training_with_utf8
++ assert_equal ['Água'], @classifier.train_interesting('Água')
++ end
++
++ def test_stemming_enabled_by_default
++ assert @classifier.stemmer_enabled?
++ end
++
++ def test_bad_training
++ assert_raises(StandardError) { @classifier.train_no_category 'words' }
++ end
++
++ def test_bad_method
++ assert_raises(NoMethodError) { @classifier.forget_everything_you_know '' }
++ end
++
++ def test_categories
++ assert_equal %w(Interesting Uninteresting).sort, @classifier.categories.sort
++ end
++
++ def test_categories_from_array
++ another_classifier = ClassifierReborn::Bayes.new %w(Interesting Uninteresting)
++ assert_equal another_classifier.categories.sort, @classifier.categories.sort
++ end
++
++ def test_add_category
++ @classifier.add_category 'Test'
++ assert_equal %w(Test Interesting Uninteresting).sort, @classifier.categories.sort
++ end
++
++ def test_dynamic_category_succeeds_with_auto_categorize
++ classifier = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', auto_categorize: true
++ classifier.train('Ruby', 'I really sweet language')
++ assert classifier.categories.include?('Ruby')
++ end
++
++ def test_dynamic_category_fails_without_auto_categorize
++ assert_raises(ClassifierReborn::Bayes::CategoryNotFoundError) do
++ @classifier.train('Ruby', 'A really sweet language')
++ end
++ refute @classifier.categories.include?('Ruby')
++ end
++
++ def test_classification
++ @classifier.train_interesting 'here are some good words. I hope you love them'
++ @classifier.train_uninteresting 'here are some bad words, I hate you'
++ assert_equal 'Uninteresting', @classifier.classify('I hate bad words and you')
++ end
++
++ def test_classification_with_threshold
++ b = ClassifierReborn::Bayes.new 'Digit'
++ assert_equal 1, b.categories.size
++
++ refute b.threshold_enabled?
++ b.enable_threshold
++ assert b.threshold_enabled?
++ assert_equal 0.0, b.threshold # default
++
++ b.threshold = -7.0
++
++ 10.times do |a_number|
++ b.train_digit(a_number.to_s)
++ b.train_digit(a_number.to_s)
++ end
++
++ 10.times do |a_number|
++ assert_equal 'Digit', b.classify(a_number.to_s)
++ end
++
++ refute b.classify('xyzzy')
++ end
++
++ def test_classification_with_threshold_again
++ b = ClassifierReborn::Bayes.new 'Normal'
++ assert_equal 1, b.categories.size
++
++ refute b.threshold_enabled?
++ b.enable_threshold
++ assert b.threshold_enabled?
++ assert_equal 0.0, b.threshold # default
++
++ %w(
++ http://example.com/about
++ http://example.com/contact
++ http://example.com/download
++ http://example.com/login
++ http://example.com/logout
++ http://example.com/blog/2015-04-01
++ ).each do |url|
++ b.train_normal(url)
++ end
++
++ assert 'Normal', b.classify('http://example.com')
++ refute b.classify("http://example.com/login/?user='select * from users;'")
++ end
++
++ def test_classification_with_score
++ @classifier.train_interesting 'here are some good words. I hope you love them'
++ @classifier.train_uninteresting 'here are some bad words, I hate you'
++ assert_in_delta(-4.85, @classifier.classify_with_score('I hate bad words and you')[1], 0.1)
++ end
++
++ def test_untrain
++ @classifier.train_interesting 'here are some good words. I hope you love them'
++ @classifier.train_uninteresting 'here are some bad words, I hate you'
++ @classifier.add_category 'colors'
++ @classifier.train_colors 'red orange green blue seven'
++ classification_of_bad_data = @classifier.classify 'seven'
++ @classifier.untrain_colors 'seven'
++ classification_after_untrain = @classifier.classify 'seven'
++ refute_equal classification_of_bad_data, classification_after_untrain
++ end
++end
+diff --git a/test/data/stopwords/en b/test/data/stopwords/en
+new file mode 100644
+index 0000000..271c6a6
+--- /dev/null
++++ b/test/data/stopwords/en
+@@ -0,0 +1,4 @@
++These
++are
++custom
++stopwords
+\ No newline at end of file
+diff --git a/test/extensions/hasher_test.rb b/test/extensions/hasher_test.rb
+new file mode 100644
+index 0000000..336a8b7
+--- /dev/null
++++ b/test/extensions/hasher_test.rb
+@@ -0,0 +1,67 @@
++require_relative '../test_helper'
++require 'tempfile'
++
++class HasherTest < Minitest::Test
++ def setup
++ @original_stopwords_path = Hasher::STOPWORDS_PATH.dup
++ end
++
++ def test_word_hash
++ hash = { good: 1, :'!' => 1, hope: 1, :"'" => 1, :'.' => 1, love: 1, word: 1, them: 1, test: 1 }
++ assert_equal hash, Hasher.word_hash("here are some good words of test's. I hope you love them!")
++ end
++
++ def test_clean_word_hash
++ hash = { good: 1, word: 1, hope: 1, love: 1, them: 1, test: 1 }
++ assert_equal hash, Hasher.clean_word_hash("here are some good words of test's. I hope you love them!")
++ end
++
++ def test_clean_word_hash_without_stemming
++ hash = { good: 1, words: 1, hope: 1, love: 1, them: 1, tests: 1 }
++ assert_equal hash, Hasher.clean_word_hash("here are some good words of test's. I hope you love them!", 'en', false)
++ end
++
++ def test_default_stopwords
++ refute_empty Hasher::STOPWORDS['en']
++ refute_empty Hasher::STOPWORDS['fr']
++ assert_empty Hasher::STOPWORDS['gibberish']
++ end
++
++ def test_loads_custom_stopwords
++ default_english_stopwords = Hasher::STOPWORDS['en']
++
++ # Remove the english stopwords
++ Hasher::STOPWORDS.delete('en')
++
++ # Add a custom stopwords path
++ Hasher::STOPWORDS_PATH.unshift File.expand_path(File.dirname(__FILE__) + '/../data/stopwords')
++
++ custom_english_stopwords = Hasher::STOPWORDS['en']
++
++ refute_equal default_english_stopwords, custom_english_stopwords
++ end
++
++ def test_add_custom_stopword_path
++ # Create stopword tempfile in current directory
++ temp_stopwords = Tempfile.new('xy', "#{File.dirname(__FILE__) + "/"}")
++
++ # Add some stopwords to tempfile
++ temp_stopwords << "this words fun"
++ temp_stopwords.close
++
++ # Get path of tempfile
++ temp_stopwords_path = File.dirname(temp_stopwords)
++
++ # Get tempfile name.
++ temp_stopwords_name = File.basename(temp_stopwords.path)
++
++ Hasher.add_custom_stopword_path(temp_stopwords_path)
++ hash = { list: 1, cool: 1 }
++ assert_equal hash, Hasher.clean_word_hash("this is a list of cool words!", temp_stopwords_name)
++ end
++
++ def teardown
++ Hasher::STOPWORDS.clear
++ Hasher::STOPWORDS_PATH.clear.concat @original_stopwords_path
++ end
++end
+diff --git a/test/lsi/lsi_test.rb b/test/lsi/lsi_test.rb
+new file mode 100644
+index 0000000..d20caf2
+--- /dev/null
++++ b/test/lsi/lsi_test.rb
+@@ -0,0 +1,203 @@
++require File.dirname(__FILE__) + '/../test_helper'
++
++class LSITest < Minitest::Test
++ def setup
++ # we repeat principal words to help weight them.
++ # This test is rather delicate, since this system is mostly noise.
++ @str1 = 'This text deals with dogs. Dogs.'
++ @str2 = 'This text involves dogs too. Dogs! '
++ @str3 = 'This text revolves around cats. Cats.'
++ @str4 = 'This text also involves cats. Cats!'
++ @str5 = 'This text involves birds. Birds.'
++ end
++
++ def test_basic_indexing
++ lsi = ClassifierReborn::LSI.new
++ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
++ assert !lsi.needs_rebuild?
++
++ # note that the closest match to str1 is str2, even though it is not
++ # the closest text match.
++ assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
++ end
++
++ def test_not_auto_rebuild
++ lsi = ClassifierReborn::LSI.new auto_rebuild: false
++ lsi.add_item @str1, 'Dog'
++ lsi.add_item @str2, 'Dog'
++ assert lsi.needs_rebuild?
++ lsi.build_index
++ assert !lsi.needs_rebuild?
++ end
++
++ def test_basic_categorizing
++ lsi = ClassifierReborn::LSI.new
++ lsi.add_item @str2, 'Dog'
++ lsi.add_item @str3, 'Cat'
++ lsi.add_item @str4, 'Cat'
++ lsi.add_item @str5, 'Bird'
++
++ assert_equal 'Dog', lsi.classify(@str1)
++ assert_equal 'Cat', lsi.classify(@str3)
++ assert_equal 'Bird', lsi.classify(@str5)
++ end
++
++ def test_basic_categorizing_with_score
++ lsi = ClassifierReborn::LSI.new
++ lsi.add_item @str2, 'Dog'
++ lsi.add_item @str3, 'Cat'
++ lsi.add_item @str4, 'Cat'
++ lsi.add_item @str5, 'Bird'
++
++ assert_in_delta 2.49, lsi.classify_with_score(@str1)[1], 0.1
++ assert_in_delta 1.41, lsi.classify_with_score(@str3)[1], 0.1
++ assert_in_delta 1.99, lsi.classify_with_score(@str5)[1], 0.1
++ end
++
++ def test_scored_categories
++ lsi = ClassifierReborn::LSI.new
++ lsi.add_item @str1, 'Dog'
++ lsi.add_item @str2, 'Dog'
++ lsi.add_item @str3, 'Cat'
++ lsi.add_item @str4, 'Cat'
++ lsi.add_item @str5, 'Bird'
++
++ scored_categories = lsi.scored_categories('dog bird cat')
++ assert_equal 2, scored_categories.size
++ assert_equal %w(Bird Dog), scored_categories.map(&:first)
++ end
++
++ def test_external_classifying
++ lsi = ClassifierReborn::LSI.new
++ bayes = ClassifierReborn::Bayes.new 'Dog', 'Cat', 'Bird'
++ lsi.add_item @str1, 'Dog'
++ bayes.train_dog @str1
++ lsi.add_item @str2, 'Dog'
++ bayes.train_dog @str2
++ lsi.add_item @str3, 'Cat'
++ bayes.train_cat @str3
++ lsi.add_item @str4, 'Cat'
++ bayes.train_cat @str4
++ lsi.add_item @str5, 'Bird'
++ bayes.train_bird @str5
++
++ # We're talking about dogs, even though the text matches the corpus on
++ # cats better. Dogs have more semantic weight than cats, so bayes
++ # will fail here, but the LSI recognizes content.
++ tricky_case = 'This text revolves around dogs.'
++ assert_equal 'Dog', lsi.classify(tricky_case)
++ refute_equal 'Dog', bayes.classify(tricky_case)
++ end
++
++ def test_recategorize_interface
++ lsi = ClassifierReborn::LSI.new
++ lsi.add_item @str1, 'Dog'
++ lsi.add_item @str2, 'Dog'
++ lsi.add_item @str3, 'Cat'
++ lsi.add_item @str4, 'Cat'
++ lsi.add_item @str5, 'Bird'
++
++ tricky_case = 'This text revolves around dogs.'
++ assert_equal 'Dog', lsi.classify(tricky_case)
++
++ # Recategorize as needed.
++ lsi.categories_for(@str1).clear.push 'Cow'
++ lsi.categories_for(@str2).clear.push 'Cow'
++
++ assert !lsi.needs_rebuild?
++ assert_equal 'Cow', lsi.classify(tricky_case)
++ end
++
++ def test_search
++ lsi = ClassifierReborn::LSI.new
++ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
++
++ # Searching by content and text, note that @str2 comes up first, because
++ # both "dog" and "involve" are present. But, the next match is @str1 instead
++ # of @str4, because "dog" carries more weight than involves.
++ assert_equal([@str2, @str1, @str4, @str5, @str3],
++ lsi.search('dog involves', 100))
++
++ # Keyword search shows how the space is mapped out in relation to
++ # dog when magnitude is removed. Note the relations. We move from dog
++ # through involve and then finally to other words.
++ assert_equal([@str1, @str2, @str4, @str5, @str3],
++ lsi.search('dog', 5))
++ end
++
++ def test_serialize_safe
++ lsi = ClassifierReborn::LSI.new
++ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
++
++ lsi_md = Marshal.dump lsi
++ lsi_m = Marshal.load lsi_md
++
++ assert_equal lsi_m.search('cat', 3), lsi.search('cat', 3)
++ assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
++ end
++
++ def test_uncached_content_node_option
++ lsi = ClassifierReborn::LSI.new
++ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
++ lsi.instance_variable_get(:@items).values.each do |node|
++ assert node.instance_of?(ContentNode)
++ end
++ end
++
++ def test_cached_content_node_option
++ lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
++ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
++ lsi.instance_variable_get(:@items).values.each do |node|
++ assert node.instance_of?(CachedContentNode)
++ end
++ end
++
++ def test_clears_cached_content_node_cache
++ return unless $GSL
++
++ lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
++ lsi.add_item @str1, 'Dog'
++ lsi.add_item @str2, 'Dog'
++ lsi.add_item @str3, 'Cat'
++ lsi.add_item @str4, 'Cat'
++ lsi.add_item @str5, 'Bird'
++
++ assert_equal 'Dog', lsi.classify('something about dogs, but not an exact dog string')
++
++ first_content_node = lsi.instance_variable_get(:@items).values.first
++ refute_nil first_content_node.instance_variable_get(:@transposed_search_vector)
++ lsi.clear_cache!
++ assert_nil first_content_node.instance_variable_get(:@transposed_search_vector)
++ end
++
++ def test_keyword_search
++ lsi = ClassifierReborn::LSI.new
++ lsi.add_item @str1, 'Dog'
++ lsi.add_item @str2, 'Dog'
++ lsi.add_item @str3, 'Cat'
++ lsi.add_item @str4, 'Cat'
++ lsi.add_item @str5, 'Bird'
++
++ assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
++ end
++
++ def test_invalid_searching_when_using_gsl
++ return unless $GSL
++ lsi = ClassifierReborn::LSI.new
++ lsi.add_item @str1, 'Dog'
++ lsi.add_item @str2, 'Dog'
++ lsi.add_item @str3, 'Cat'
++ lsi.add_item @str4, 'Cat'
++ lsi.add_item @str5, 'Bird'
++ assert_output(/There are no documents that are similar to penguin/) { lsi.search('penguin') }
++ end
++
++ def test_warn_when_adding_bad_document
++ lsi = ClassifierReborn::LSI.new
++ assert_output(/Input: 'i can' is entirely stopwords or words with 2 or fewer characters. Classifier-Reborn cannot handle this document properly./) { lsi.add_item("i can") }
++ end
++
++ def test_summary
++ assert_equal 'This text involves dogs too [...] This text also involves cats', Summarizer.summary([@str1, @str2, @str3, @str4, @str5].join, 2)
++ end
++end
+diff --git a/test/lsi/word_list_test.rb b/test/lsi/word_list_test.rb
+new file mode 100644
+index 0000000..deffc41
+--- /dev/null
++++ b/test/lsi/word_list_test.rb
+@@ -0,0 +1,33 @@
++require_relative '../test_helper'
++
++class WordListTest < Minitest::Test
++ def test_size_does_not_count_words_twice
++ list = ClassifierReborn::WordList.new
++ assert list.size == 0
++
++ list.add_word('hello')
++ assert list.size == 1
++
++ list.add_word('hello')
++ assert list.size == 1
++
++ list.add_word('world')
++ assert list.size == 2
++ end
++
++ def test_brackets_return_correct_position_based_on_add_order
++ list = ClassifierReborn::WordList.new
++ list.add_word('hello')
++ list.add_word('world')
++ assert list['hello'] == 0
++ assert list['world'] == 1
++ end
++
++ def test_word_for_index_returns_correct_word_based_on_add_order
++ list = ClassifierReborn::WordList.new
++ list.add_word('hello')
++ list.add_word('world')
++ assert list.word_for_index(0) == 'hello'
++ assert list.word_for_index(1) == 'world'
++ end
++end
+diff --git a/test/test_helper.rb b/test/test_helper.rb
+new file mode 100644
+index 0000000..b539406
+--- /dev/null
++++ b/test/test_helper.rb
+@@ -0,0 +1,8 @@
++$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')
++
++require 'minitest/autorun'
++require 'minitest/reporters'
++Minitest::Reporters.use!
++require 'pry'
++require 'classifier-reborn'
++include ClassifierReborn
diff --git a/debian/patches/series b/debian/patches/series
index 074bbd5..ae0454b 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1 +1,2 @@
0001-Replace-gemspec-git-execution.patch
+0002-Import-test-from-upstream.patch
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ruby-extras/ruby-classifier-reborn.git
More information about the Pkg-ruby-extras-commits
mailing list