[DRE-commits] [ruby-classifier-reborn] 02/02: Import test from upstream

Youhei SASAKI uwabami-guest at moszumanska.debian.org
Thu Aug 31 01:34:52 UTC 2017


This is an automated email from the git hooks/post-receive script.

uwabami-guest pushed a commit to branch patch-queue/master
in repository ruby-classifier-reborn.

commit 05259de331f3b3137e545296ced108a683decebf
Author: Youhei SASAKI <uwabami at gfd-dennou.org>
Date:   Thu Aug 31 10:26:59 2017 +0900

    Import test from upstream
    
    Signed-off-by: Youhei SASAKI <uwabami at gfd-dennou.org>
---
 test/bayes/bayesian_test.rb    | 125 +++++++++++++++++++++++++
 test/data/stopwords/en         |   4 +
 test/extensions/hasher_test.rb |  67 ++++++++++++++
 test/lsi/lsi_test.rb           | 203 +++++++++++++++++++++++++++++++++++++++++
 test/lsi/word_list_test.rb     |  33 +++++++
 test/test_helper.rb            |   8 ++
 6 files changed, 440 insertions(+)

diff --git a/test/bayes/bayesian_test.rb b/test/bayes/bayesian_test.rb
new file mode 100755
index 0000000..f2f355b
--- /dev/null
+++ b/test/bayes/bayesian_test.rb
@@ -0,0 +1,125 @@
+# encoding: utf-8
+
+require File.dirname(__FILE__) + '/../test_helper'
+class BayesianTest < Minitest::Test
+  def setup
+    @classifier = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
+  end
+
+  def test_good_training
+     assert_equal ['love'], @classifier.train_interesting('love')
+  end
+
+  def test_training_with_utf8
+    assert_equal ['Água'], @classifier.train_interesting('Água')
+  end
+
+  def test_stemming_enabled_by_default
+    assert @classifier.stemmer_enabled?
+  end
+
+  def test_bad_training
+    assert_raises(StandardError) { @classifier.train_no_category 'words' }
+  end
+
+  def test_bad_method
+    assert_raises(NoMethodError) { @classifier.forget_everything_you_know '' }
+  end
+
+  def test_categories
+    assert_equal %w(Interesting Uninteresting).sort, @classifier.categories.sort
+  end
+
+  def test_categories_from_array
+    another_classifier = ClassifierReborn::Bayes.new %w(Interesting Uninteresting)
+    assert_equal another_classifier.categories.sort, @classifier.categories.sort
+  end
+
+  def test_add_category
+    @classifier.add_category 'Test'
+    assert_equal %w(Test Interesting Uninteresting).sort, @classifier.categories.sort
+  end
+
+  def test_dynamic_category_succeeds_with_auto_categorize
+    classifier = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', auto_categorize: true
+    classifier.train('Ruby', 'I really sweet language')
+    assert classifier.categories.include?('Ruby')
+  end
+
+  def test_dynamic_category_fails_without_auto_categorize
+    assert_raises(ClassifierReborn::Bayes::CategoryNotFoundError) do
+      @classifier.train('Ruby', 'A really sweet language')
+    end
+    refute @classifier.categories.include?('Ruby')
+  end
+
+  def test_classification
+    @classifier.train_interesting 'here are some good words. I hope you love them'
+    @classifier.train_uninteresting 'here are some bad words, I hate you'
+    assert_equal 'Uninteresting', @classifier.classify('I hate bad words and you')
+  end
+
+  def test_classification_with_threshold
+    b = ClassifierReborn::Bayes.new 'Digit'
+    assert_equal 1, b.categories.size
+
+    refute b.threshold_enabled?
+    b.enable_threshold
+    assert b.threshold_enabled?
+    assert_equal 0.0, b.threshold # default
+
+    b.threshold = -7.0
+
+    10.times do |a_number|
+      b.train_digit(a_number.to_s)
+      b.train_digit(a_number.to_s)
+    end
+
+    10.times do |a_number|
+      assert_equal 'Digit', b.classify(a_number.to_s)
+    end
+
+    refute b.classify('xyzzy')
+  end
+
+  def test_classification_with_threshold_again
+    b = ClassifierReborn::Bayes.new 'Normal'
+    assert_equal 1, b.categories.size
+
+    refute b.threshold_enabled?
+    b.enable_threshold
+    assert b.threshold_enabled?
+    assert_equal 0.0, b.threshold # default
+
+    %w(
+      http://example.com/about
+      http://example.com/contact
+      http://example.com/download
+      http://example.com/login
+      http://example.com/logout
+      http://example.com/blog/2015-04-01
+    ).each do |url|
+      b.train_normal(url)
+    end
+
+    assert 'Normal', b.classify('http://example.com') # NOTE(review): cannot fail -- 'Normal' is the tested value, classify's result is only the message; confirm intent
+    refute b.classify("http://example.com/login/?user='select * from users;'")
+  end
+
+  def test_classification_with_score
+    @classifier.train_interesting 'here are some good words. I hope you love them'
+    @classifier.train_uninteresting 'here are some bad words, I hate you'
+    assert_in_delta(-4.85, @classifier.classify_with_score('I hate bad words and you')[1], 0.1)
+  end
+
+  def test_untrain
+    @classifier.train_interesting 'here are some good words. I hope you love them'
+    @classifier.train_uninteresting 'here are some bad words, I hate you'
+    @classifier.add_category 'colors'
+    @classifier.train_colors 'red orange green blue seven'
+    classification_of_bad_data = @classifier.classify 'seven'
+    @classifier.untrain_colors 'seven'
+    classification_after_untrain = @classifier.classify 'seven'
+    refute_equal classification_of_bad_data, classification_after_untrain
+  end
+end
diff --git a/test/data/stopwords/en b/test/data/stopwords/en
new file mode 100644
index 0000000..271c6a6
--- /dev/null
+++ b/test/data/stopwords/en
@@ -0,0 +1,4 @@
+These
+are
+custom
+stopwords
\ No newline at end of file
diff --git a/test/extensions/hasher_test.rb b/test/extensions/hasher_test.rb
new file mode 100644
index 0000000..336a8b7
--- /dev/null
+++ b/test/extensions/hasher_test.rb
@@ -0,0 +1,67 @@
+require_relative '../test_helper'
+require 'tempfile'
+
+class HasherTest < Minitest::Test
+  def setup
+    @original_stopwords_path = Hasher::STOPWORDS_PATH.dup
+  end
+
+  def test_word_hash
+    hash = { good: 1, :'!' => 1, hope: 1, :"'" => 1, :'.' => 1, love: 1, word: 1, them: 1, test: 1 }
+    assert_equal hash, Hasher.word_hash("here are some good words of test's. I hope you love them!")
+  end
+
+  def test_clean_word_hash
+    hash = { good: 1, word: 1, hope: 1, love: 1, them: 1, test: 1 }
+    assert_equal hash, Hasher.clean_word_hash("here are some good words of test's. I hope you love them!")
+  end
+
+  def test_clean_word_hash_without_stemming
+    hash = { good: 1, words: 1, hope: 1, love: 1, them: 1, tests: 1 }
+    assert_equal hash, Hasher.clean_word_hash("here are some good words of test's. I hope you love them!", 'en', false)
+  end
+
+  def test_default_stopwords
+    refute_empty Hasher::STOPWORDS['en']
+    refute_empty Hasher::STOPWORDS['fr']
+    assert_empty Hasher::STOPWORDS['gibberish']
+  end
+
+  def test_loads_custom_stopwords
+    default_english_stopwords = Hasher::STOPWORDS['en']
+
+    # Remove the english stopwords
+    Hasher::STOPWORDS.delete('en')
+
+    # Add a custom stopwords path
+    Hasher::STOPWORDS_PATH.unshift File.expand_path(File.dirname(__FILE__) + '/../data/stopwords')
+
+    custom_english_stopwords = Hasher::STOPWORDS['en']
+
+    refute_equal default_english_stopwords, custom_english_stopwords
+  end
+
+  def test_add_custom_stopword_path
+    # Create stopword tempfile in the directory of this test file
+    temp_stopwords = Tempfile.new('xy', "#{File.dirname(__FILE__) + "/"}")
+
+    # Add some stopwords to tempfile
+    temp_stopwords << "this words fun"
+    temp_stopwords.close
+
+    # Get the directory that contains the tempfile
+    temp_stopwords_path = File.dirname(temp_stopwords)
+
+    # Get tempfile name; it is passed below as the language argument.
+    temp_stopwords_name = File.basename(temp_stopwords.path)
+
+    Hasher.add_custom_stopword_path(temp_stopwords_path)
+    hash = { list: 1, cool: 1 }
+    assert_equal hash, Hasher.clean_word_hash("this is a list of cool words!", temp_stopwords_name)
+  end
+
+  def teardown
+    Hasher::STOPWORDS.clear
+    Hasher::STOPWORDS_PATH.clear.concat @original_stopwords_path
+  end
+end
diff --git a/test/lsi/lsi_test.rb b/test/lsi/lsi_test.rb
new file mode 100644
index 0000000..d20caf2
--- /dev/null
+++ b/test/lsi/lsi_test.rb
@@ -0,0 +1,203 @@
+require File.dirname(__FILE__) + '/../test_helper'
+
+class LSITest < Minitest::Test
+  def setup
+    # We repeat the principal words to help weight them.
+    # This test is rather delicate, since this system is mostly noise.
+    @str1 = 'This text deals with dogs. Dogs.'
+    @str2 = 'This text involves dogs too. Dogs! '
+    @str3 = 'This text revolves around cats. Cats.'
+    @str4 = 'This text also involves cats. Cats!'
+    @str5 = 'This text involves birds. Birds.'
+  end
+
+  def test_basic_indexing
+    lsi = ClassifierReborn::LSI.new
+    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
+    assert !lsi.needs_rebuild?
+
+    # note that the closest match to str1 is str2, even though it is not
+    # the closest text match.
+    assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
+  end
+
+  def test_not_auto_rebuild
+    lsi = ClassifierReborn::LSI.new auto_rebuild: false
+    lsi.add_item @str1, 'Dog'
+    lsi.add_item @str2, 'Dog'
+    assert lsi.needs_rebuild?
+    lsi.build_index
+    assert !lsi.needs_rebuild?
+  end
+
+  def test_basic_categorizing
+    lsi = ClassifierReborn::LSI.new
+    lsi.add_item @str2, 'Dog'
+    lsi.add_item @str3, 'Cat'
+    lsi.add_item @str4, 'Cat'
+    lsi.add_item @str5, 'Bird'
+
+    assert_equal 'Dog', lsi.classify(@str1)
+    assert_equal 'Cat', lsi.classify(@str3)
+    assert_equal 'Bird', lsi.classify(@str5)
+  end
+
+  def test_basic_categorizing_with_score
+    lsi = ClassifierReborn::LSI.new
+    lsi.add_item @str2, 'Dog'
+    lsi.add_item @str3, 'Cat'
+    lsi.add_item @str4, 'Cat'
+    lsi.add_item @str5, 'Bird'
+
+    assert_in_delta 2.49, lsi.classify_with_score(@str1)[1], 0.1
+    assert_in_delta 1.41, lsi.classify_with_score(@str3)[1], 0.1
+    assert_in_delta 1.99, lsi.classify_with_score(@str5)[1], 0.1
+  end
+
+  def test_scored_categories
+    lsi = ClassifierReborn::LSI.new
+    lsi.add_item @str1, 'Dog'
+    lsi.add_item @str2, 'Dog'
+    lsi.add_item @str3, 'Cat'
+    lsi.add_item @str4, 'Cat'
+    lsi.add_item @str5, 'Bird'
+
+    scored_categories = lsi.scored_categories('dog bird cat')
+    assert_equal 2, scored_categories.size
+    assert_equal %w(Bird Dog), scored_categories.map(&:first)
+  end
+
+  def test_external_classifying
+    lsi = ClassifierReborn::LSI.new
+    bayes = ClassifierReborn::Bayes.new 'Dog', 'Cat', 'Bird'
+    lsi.add_item @str1, 'Dog'
+    bayes.train_dog @str1
+    lsi.add_item @str2, 'Dog'
+    bayes.train_dog @str2
+    lsi.add_item @str3, 'Cat'
+    bayes.train_cat @str3
+    lsi.add_item @str4, 'Cat'
+    bayes.train_cat @str4
+    lsi.add_item @str5, 'Bird'
+    bayes.train_bird @str5
+
+    # We're talking about dogs, even though the text matches the corpus on
+    # cats better. Dogs have more semantic weight than cats, so bayes
+    # will fail here, but the LSI recognizes the content.
+    tricky_case = 'This text revolves around dogs.'
+    assert_equal 'Dog', lsi.classify(tricky_case)
+    refute_equal 'Dog', bayes.classify(tricky_case)
+  end
+
+  def test_recategorize_interface
+    lsi = ClassifierReborn::LSI.new
+    lsi.add_item @str1, 'Dog'
+    lsi.add_item @str2, 'Dog'
+    lsi.add_item @str3, 'Cat'
+    lsi.add_item @str4, 'Cat'
+    lsi.add_item @str5, 'Bird'
+
+    tricky_case = 'This text revolves around dogs.'
+    assert_equal 'Dog', lsi.classify(tricky_case)
+
+    # Recategorize as needed.
+    lsi.categories_for(@str1).clear.push 'Cow'
+    lsi.categories_for(@str2).clear.push 'Cow'
+
+    assert !lsi.needs_rebuild?
+    assert_equal 'Cow', lsi.classify(tricky_case)
+  end
+
+  def test_search
+    lsi = ClassifierReborn::LSI.new
+    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
+
+    # Searching by content and text, note that @str2 comes up first, because
+    # both "dog" and "involve" are present. But the next match is @str1 instead
+    # of @str4, because "dog" carries more weight than "involve".
+    assert_equal([@str2, @str1, @str4, @str5, @str3],
+                 lsi.search('dog involves', 100))
+
+    # Keyword search shows how the space is mapped out in relation to
+    # "dog" when magnitude is removed. Note the relations: we move from "dog"
+    # through "involve" and then finally to other words.
+    assert_equal([@str1, @str2, @str4, @str5, @str3],
+                 lsi.search('dog', 5))
+  end
+
+  def test_serialize_safe
+    lsi = ClassifierReborn::LSI.new
+    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
+
+    lsi_md = Marshal.dump lsi
+    lsi_m = Marshal.load lsi_md
+
+    assert_equal lsi_m.search('cat', 3), lsi.search('cat', 3)
+    assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
+  end
+
+  def test_uncached_content_node_option
+    lsi = ClassifierReborn::LSI.new
+    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
+    lsi.instance_variable_get(:@items).values.each do |node|
+      assert node.instance_of?(ContentNode)
+    end
+  end
+
+  def test_cached_content_node_option
+    lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
+    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
+    lsi.instance_variable_get(:@items).values.each do |node|
+      assert node.instance_of?(CachedContentNode)
+    end
+  end
+
+  def test_clears_cached_content_node_cache
+    return unless $GSL
+
+    lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
+    lsi.add_item @str1, 'Dog'
+    lsi.add_item @str2, 'Dog'
+    lsi.add_item @str3, 'Cat'
+    lsi.add_item @str4, 'Cat'
+    lsi.add_item @str5, 'Bird'
+
+    assert_equal 'Dog', lsi.classify('something about dogs, but not an exact dog string')
+
+    first_content_node = lsi.instance_variable_get(:@items).values.first
+    refute_nil first_content_node.instance_variable_get(:@transposed_search_vector)
+    lsi.clear_cache!
+    assert_nil first_content_node.instance_variable_get(:@transposed_search_vector)
+  end
+
+  def test_keyword_search
+    lsi = ClassifierReborn::LSI.new
+    lsi.add_item @str1, 'Dog'
+    lsi.add_item @str2, 'Dog'
+    lsi.add_item @str3, 'Cat'
+    lsi.add_item @str4, 'Cat'
+    lsi.add_item @str5, 'Bird'
+
+    assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
+  end
+
+  def test_invalid_searching_when_using_gsl
+    return unless $GSL
+    lsi = ClassifierReborn::LSI.new
+    lsi.add_item @str1, 'Dog'
+    lsi.add_item @str2, 'Dog'
+    lsi.add_item @str3, 'Cat'
+    lsi.add_item @str4, 'Cat'
+    lsi.add_item @str5, 'Bird'
+    assert_output(/There are no documents that are similar to penguin/) { lsi.search('penguin') }
+  end
+
+  def test_warn_when_adding_bad_document
+    lsi = ClassifierReborn::LSI.new
+    assert_output(/Input: 'i can' is entirely stopwords or words with 2 or fewer characters. Classifier-Reborn cannot handle this document properly./) { lsi.add_item("i can") }
+  end
+
+  def test_summary
+    assert_equal 'This text involves dogs too [...] This text also involves cats', Summarizer.summary([@str1, @str2, @str3, @str4, @str5].join, 2)
+  end
+end
diff --git a/test/lsi/word_list_test.rb b/test/lsi/word_list_test.rb
new file mode 100644
index 0000000..deffc41
--- /dev/null
+++ b/test/lsi/word_list_test.rb
@@ -0,0 +1,33 @@
+require_relative '../test_helper'
+
+class WordListTest < Minitest::Test
+  def test_size_does_not_count_words_twice
+    list = ClassifierReborn::WordList.new
+    assert list.size == 0
+
+    list.add_word('hello')
+    assert list.size == 1
+
+    list.add_word('hello')
+    assert list.size == 1
+
+    list.add_word('world')
+    assert list.size == 2
+  end
+
+  def test_brackets_return_correct_position_based_on_add_order
+    list = ClassifierReborn::WordList.new
+    list.add_word('hello')
+    list.add_word('world')
+    assert list['hello'] == 0
+    assert list['world'] == 1
+  end
+
+  def test_word_for_index_returns_correct_word_based_on_add_order
+    list = ClassifierReborn::WordList.new
+    list.add_word('hello')
+    list.add_word('world')
+    assert list.word_for_index(0) == 'hello'
+    assert list.word_for_index(1) == 'world'
+  end
+end
diff --git a/test/test_helper.rb b/test/test_helper.rb
new file mode 100644
index 0000000..b539406
--- /dev/null
+++ b/test/test_helper.rb
@@ -0,0 +1,8 @@
+$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')
+
+require 'minitest/autorun'
+require 'minitest/reporters'
+Minitest::Reporters.use!
+require 'pry'
+require 'classifier-reborn'
+include ClassifierReborn

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ruby-extras/ruby-classifier-reborn.git



More information about the Pkg-ruby-extras-commits mailing list