[DRE-commits] [ruby-classifier-reborn] 06/08: Import test from upstream

Thu Aug 31 01:34:51 UTC 2017

This is an automated email from the git hooks/post-receive script.

uwabami-guest pushed a commit to branch master
in repository ruby-classifier-reborn.

commit 8c8baa1df0209a0d98ebd7a741862b850977db89
Author: Youhei SASAKI <uwabami at gfd-dennou.org>
Date:   Thu Aug 31 10:31:27 2017 +0900

    Import test from upstream
    
    Signed-off-by: Youhei SASAKI <uwabami at gfd-dennou.org>
---
 .../patches/0002-Import-test-from-upstream.patch   | 497 +++++++++++++++++++++
 debian/patches/series                              |   1 +
 2 files changed, 498 insertions(+)

diff --git a/debian/patches/0002-Import-test-from-upstream.patch b/debian/patches/0002-Import-test-from-upstream.patch
new file mode 100644
index 0000000..2a3c29c
--- /dev/null
+++ b/debian/patches/0002-Import-test-from-upstream.patch
@@ -0,0 +1,497 @@
+From: Youhei SASAKI <uwabami at gfd-dennou.org>
+Date: Thu, 31 Aug 2017 10:26:59 +0900
+Subject: Import test from upstream
+
+Signed-off-by: Youhei SASAKI <uwabami at gfd-dennou.org>
+---
+ test/bayes/bayesian_test.rb    | 125 +++++++++++++++++++++++++
+ test/data/stopwords/en         |   4 +
+ test/extensions/hasher_test.rb |  67 ++++++++++++++
+ test/lsi/lsi_test.rb           | 203 +++++++++++++++++++++++++++++++++++++++++
+ test/lsi/word_list_test.rb     |  33 +++++++
+ test/test_helper.rb            |   8 ++
+ 6 files changed, 440 insertions(+)
+ create mode 100755 test/bayes/bayesian_test.rb
+ create mode 100644 test/data/stopwords/en
+ create mode 100644 test/extensions/hasher_test.rb
+ create mode 100644 test/lsi/lsi_test.rb
+ create mode 100644 test/lsi/word_list_test.rb
+ create mode 100644 test/test_helper.rb
+
+diff --git a/test/bayes/bayesian_test.rb b/test/bayes/bayesian_test.rb
+new file mode 100755
+index 0000000..f2f355b
+--- /dev/null
++++ b/test/bayes/bayesian_test.rb
+@@ -0,0 +1,125 @@
++# encoding: utf-8
++
++require File.dirname(__FILE__) + '/../test_helper'
++class BayesianTest < Minitest::Test
++  def setup
++    @classifier = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
++  end
++
++  def test_good_training
++     assert_equal ['love'], @classifier.train_interesting('love')
++  end
++
++  def test_training_with_utf8
++    assert_equal ['Água'], @classifier.train_interesting('Água')
++  end
++
++  def test_stemming_enabled_by_default
++    assert @classifier.stemmer_enabled?
++  end
++
++  def test_bad_training
++    assert_raises(StandardError) { @classifier.train_no_category 'words' }
++  end
++
++  def test_bad_method
++    assert_raises(NoMethodError) { @classifier.forget_everything_you_know '' }
++  end
++
++  def test_categories
++    assert_equal %w(Interesting Uninteresting).sort, @classifier.categories.sort
++  end
++
++  def test_categories_from_array
++    another_classifier = ClassifierReborn::Bayes.new %w(Interesting Uninteresting)
++    assert_equal another_classifier.categories.sort, @classifier.categories.sort
++  end
++
++  def test_add_category
++    @classifier.add_category 'Test'
++    assert_equal %w(Test Interesting Uninteresting).sort, @classifier.categories.sort
++  end
++
++  def test_dynamic_category_succeeds_with_auto_categorize
++    classifier = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', auto_categorize: true
++    classifier.train('Ruby', 'I really sweet language')
++    assert classifier.categories.include?('Ruby')
++  end
++
++  def test_dynamic_category_fails_without_auto_categorize
++    assert_raises(ClassifierReborn::Bayes::CategoryNotFoundError) do
++      @classifier.train('Ruby', 'A really sweet language')
++    end
++    refute @classifier.categories.include?('Ruby')
++  end
++
++  def test_classification
++    @classifier.train_interesting 'here are some good words. I hope you love them'
++    @classifier.train_uninteresting 'here are some bad words, I hate you'
++    assert_equal 'Uninteresting', @classifier.classify('I hate bad words and you')
++  end
++
++  def test_classification_with_threshold
++    b = ClassifierReborn::Bayes.new 'Digit'
++    assert_equal 1, b.categories.size
++
++    refute b.threshold_enabled?
++    b.enable_threshold
++    assert b.threshold_enabled?
++    assert_equal 0.0, b.threshold # default
++
++    b.threshold = -7.0
++
++    10.times do |a_number|
++      b.train_digit(a_number.to_s)
++      b.train_digit(a_number.to_s)
++    end
++
++    10.times do |a_number|
++      assert_equal 'Digit', b.classify(a_number.to_s)
++    end
++
++    refute b.classify('xyzzy')
++  end
++
++  def test_classification_with_threshold_again
++    b = ClassifierReborn::Bayes.new 'Normal'
++    assert_equal 1, b.categories.size
++
++    refute b.threshold_enabled?
++    b.enable_threshold
++    assert b.threshold_enabled?
++    assert_equal 0.0, b.threshold # default
++
++    %w(
++      http://example.com/about
++      http://example.com/contact
++      http://example.com/download
++      http://example.com/login
++      http://example.com/logout
++      http://example.com/blog/2015-04-01
++    ).each do |url|
++      b.train_normal(url)
++    end
++
++    assert 'Normal', b.classify('http://example.com') # NOTE(review): 'Normal' is the tested (truthy) value and the classify result is only the failure message, so this never fails; assert_equal may be intended - confirm the expected result under the 0.0 threshold
++    refute b.classify("http://example.com/login/?user='select * from users;'")
++  end
++
++  def test_classification_with_score
++    @classifier.train_interesting 'here are some good words. I hope you love them'
++    @classifier.train_uninteresting 'here are some bad words, I hate you'
++    assert_in_delta(-4.85, @classifier.classify_with_score('I hate bad words and you')[1], 0.1)
++  end
++
++  def test_untrain
++    @classifier.train_interesting 'here are some good words. I hope you love them'
++    @classifier.train_uninteresting 'here are some bad words, I hate you'
++    @classifier.add_category 'colors'
++    @classifier.train_colors 'red orange green blue seven'
++    classification_of_bad_data = @classifier.classify 'seven'
++    @classifier.untrain_colors 'seven'
++    classification_after_untrain = @classifier.classify 'seven'
++    refute_equal classification_of_bad_data, classification_after_untrain
++  end
++end
+diff --git a/test/data/stopwords/en b/test/data/stopwords/en
+new file mode 100644
+index 0000000..271c6a6
+--- /dev/null
++++ b/test/data/stopwords/en
+@@ -0,0 +1,4 @@
++These
++are
++custom
++stopwords
+\ No newline at end of file
+diff --git a/test/extensions/hasher_test.rb b/test/extensions/hasher_test.rb
+new file mode 100644
+index 0000000..336a8b7
+--- /dev/null
++++ b/test/extensions/hasher_test.rb
+@@ -0,0 +1,67 @@
++require_relative '../test_helper'
++require 'tempfile'
++
++class HasherTest < Minitest::Test
++  def setup
++    @original_stopwords_path = Hasher::STOPWORDS_PATH.dup
++  end
++
++  def test_word_hash
++    hash = { good: 1, :'!' => 1, hope: 1, :"'" => 1, :'.' => 1, love: 1, word: 1, them: 1, test: 1 }
++    assert_equal hash, Hasher.word_hash("here are some good words of test's. I hope you love them!")
++  end
++
++  def test_clean_word_hash
++    hash = { good: 1, word: 1, hope: 1, love: 1, them: 1, test: 1 }
++    assert_equal hash, Hasher.clean_word_hash("here are some good words of test's. I hope you love them!")
++  end
++
++  def test_clean_word_hash_without_stemming
++    hash = { good: 1, words: 1, hope: 1, love: 1, them: 1, tests: 1 }
++    assert_equal hash, Hasher.clean_word_hash("here are some good words of test's. I hope you love them!", 'en', false)
++  end
++
++  def test_default_stopwords
++    refute_empty Hasher::STOPWORDS['en']
++    refute_empty Hasher::STOPWORDS['fr']
++    assert_empty Hasher::STOPWORDS['gibberish']
++  end
++
++  def test_loads_custom_stopwords
++    default_english_stopwords = Hasher::STOPWORDS['en']
++
++    # Remove the english stopwords
++    Hasher::STOPWORDS.delete('en')
++
++    # Add a custom stopwords path
++    Hasher::STOPWORDS_PATH.unshift File.expand_path(File.dirname(__FILE__) + '/../data/stopwords')
++
++    custom_english_stopwords = Hasher::STOPWORDS['en']
++
++    refute_equal default_english_stopwords, custom_english_stopwords
++  end
++
++  def test_add_custom_stopword_path
++    # Create stopword tempfile in current directory
++    temp_stopwords = Tempfile.new('xy', "#{File.dirname(__FILE__) + "/"}")
++
++    # Add some stopwords to tempfile
++    temp_stopwords << "this words fun"
++    temp_stopwords.close
++
++    # Get path of tempfile
++    temp_stopwords_path = File.dirname(temp_stopwords)
++
++    # Get tempfile name.
++    temp_stopwords_name = File.basename(temp_stopwords.path)
++
++    Hasher.add_custom_stopword_path(temp_stopwords_path)
++    hash = { list: 1, cool: 1 }
++    assert_equal hash, Hasher.clean_word_hash("this is a list of cool words!", temp_stopwords_name)
++  end
++
++  def teardown
++    Hasher::STOPWORDS.clear
++    Hasher::STOPWORDS_PATH.clear.concat @original_stopwords_path
++  end
++end
+diff --git a/test/lsi/lsi_test.rb b/test/lsi/lsi_test.rb
+new file mode 100644
+index 0000000..d20caf2
+--- /dev/null
++++ b/test/lsi/lsi_test.rb
+@@ -0,0 +1,203 @@
++require File.dirname(__FILE__) + '/../test_helper'
++
++class LSITest < Minitest::Test
++  def setup
++    # We repeat the principal words to help weight them.
++    # This test is rather delicate, since this system is mostly noise.
++    @str1 = 'This text deals with dogs. Dogs.'
++    @str2 = 'This text involves dogs too. Dogs! '
++    @str3 = 'This text revolves around cats. Cats.'
++    @str4 = 'This text also involves cats. Cats!'
++    @str5 = 'This text involves birds. Birds.'
++  end
++
++  def test_basic_indexing
++    lsi = ClassifierReborn::LSI.new
++    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
++    assert !lsi.needs_rebuild?
++
++    # note that the closest match to str1 is str2, even though it is not
++    # the closest text match.
++    assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
++  end
++
++  def test_not_auto_rebuild
++    lsi = ClassifierReborn::LSI.new auto_rebuild: false
++    lsi.add_item @str1, 'Dog'
++    lsi.add_item @str2, 'Dog'
++    assert lsi.needs_rebuild?
++    lsi.build_index
++    assert !lsi.needs_rebuild?
++  end
++
++  def test_basic_categorizing
++    lsi = ClassifierReborn::LSI.new
++    lsi.add_item @str2, 'Dog'
++    lsi.add_item @str3, 'Cat'
++    lsi.add_item @str4, 'Cat'
++    lsi.add_item @str5, 'Bird'
++
++    assert_equal 'Dog', lsi.classify(@str1)
++    assert_equal 'Cat', lsi.classify(@str3)
++    assert_equal 'Bird', lsi.classify(@str5)
++  end
++
++  def test_basic_categorizing_with_score
++    lsi = ClassifierReborn::LSI.new
++    lsi.add_item @str2, 'Dog'
++    lsi.add_item @str3, 'Cat'
++    lsi.add_item @str4, 'Cat'
++    lsi.add_item @str5, 'Bird'
++
++    assert_in_delta 2.49, lsi.classify_with_score(@str1)[1], 0.1
++    assert_in_delta 1.41, lsi.classify_with_score(@str3)[1], 0.1
++    assert_in_delta 1.99, lsi.classify_with_score(@str5)[1], 0.1
++  end
++
++  def test_scored_categories
++    lsi = ClassifierReborn::LSI.new
++    lsi.add_item @str1, 'Dog'
++    lsi.add_item @str2, 'Dog'
++    lsi.add_item @str3, 'Cat'
++    lsi.add_item @str4, 'Cat'
++    lsi.add_item @str5, 'Bird'
++
++    scored_categories = lsi.scored_categories('dog bird cat')
++    assert_equal 2, scored_categories.size
++    assert_equal %w(Bird Dog), scored_categories.map(&:first)
++  end
++
++  def test_external_classifying
++    lsi = ClassifierReborn::LSI.new
++    bayes = ClassifierReborn::Bayes.new 'Dog', 'Cat', 'Bird'
++    lsi.add_item @str1, 'Dog'
++    bayes.train_dog @str1
++    lsi.add_item @str2, 'Dog'
++    bayes.train_dog @str2
++    lsi.add_item @str3, 'Cat'
++    bayes.train_cat @str3
++    lsi.add_item @str4, 'Cat'
++    bayes.train_cat @str4
++    lsi.add_item @str5, 'Bird'
++    bayes.train_bird @str5
++
++    # We're talking about dogs. Even though the text matches the corpus on
++    # cats better.  Dogs have more semantic weight than cats. So bayes
++    # will fail here, but the LSI recognizes content.
++    tricky_case = 'This text revolves around dogs.'
++    assert_equal 'Dog', lsi.classify(tricky_case)
++    refute_equal 'Dog', bayes.classify(tricky_case)
++  end
++
++  def test_recategorize_interface
++    lsi = ClassifierReborn::LSI.new
++    lsi.add_item @str1, 'Dog'
++    lsi.add_item @str2, 'Dog'
++    lsi.add_item @str3, 'Cat'
++    lsi.add_item @str4, 'Cat'
++    lsi.add_item @str5, 'Bird'
++
++    tricky_case = 'This text revolves around dogs.'
++    assert_equal 'Dog', lsi.classify(tricky_case)
++
++    # Recategorize as needed.
++    lsi.categories_for(@str1).clear.push 'Cow'
++    lsi.categories_for(@str2).clear.push 'Cow'
++
++    assert !lsi.needs_rebuild?
++    assert_equal 'Cow', lsi.classify(tricky_case)
++  end
++
++  def test_search
++    lsi = ClassifierReborn::LSI.new
++    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
++
++    # Searching by content and text, note that @str2 comes up first, because
++    # both "dog" and "involve" are present. But, the next match is @str1 instead
++    # of @str4, because "dog" carries more weight than involves.
++    assert_equal([@str2, @str1, @str4, @str5, @str3],
++                 lsi.search('dog involves', 100))
++
++    # Keyword search shows how the space is mapped out in relation to
++    # dog when magnitude is removed. Note the relations. We move from dog
++    # through involve and then finally to other words.
++    assert_equal([@str1, @str2, @str4, @str5, @str3],
++                 lsi.search('dog', 5))
++  end
++
++  def test_serialize_safe
++    lsi = ClassifierReborn::LSI.new
++    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
++
++    lsi_md = Marshal.dump lsi
++    lsi_m = Marshal.load lsi_md
++
++    assert_equal lsi_m.search('cat', 3), lsi.search('cat', 3)
++    assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
++  end
++
++  def test_uncached_content_node_option
++    lsi = ClassifierReborn::LSI.new
++    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
++    lsi.instance_variable_get(:@items).values.each do |node|
++      assert node.instance_of?(ContentNode)
++    end
++  end
++
++  def test_cached_content_node_option
++    lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
++    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
++    lsi.instance_variable_get(:@items).values.each do |node|
++      assert node.instance_of?(CachedContentNode)
++    end
++  end
++
++  def test_clears_cached_content_node_cache
++    return unless $GSL
++
++    lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
++    lsi.add_item @str1, 'Dog'
++    lsi.add_item @str2, 'Dog'
++    lsi.add_item @str3, 'Cat'
++    lsi.add_item @str4, 'Cat'
++    lsi.add_item @str5, 'Bird'
++
++    assert_equal 'Dog', lsi.classify('something about dogs, but not an exact dog string')
++
++    first_content_node = lsi.instance_variable_get(:@items).values.first
++    refute_nil first_content_node.instance_variable_get(:@transposed_search_vector)
++    lsi.clear_cache!
++    assert_nil first_content_node.instance_variable_get(:@transposed_search_vector)
++  end
++
++  def test_keyword_search
++    lsi = ClassifierReborn::LSI.new
++    lsi.add_item @str1, 'Dog'
++    lsi.add_item @str2, 'Dog'
++    lsi.add_item @str3, 'Cat'
++    lsi.add_item @str4, 'Cat'
++    lsi.add_item @str5, 'Bird'
++
++    assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
++  end
++
++  def test_invalid_searching_when_using_gsl
++    return unless $GSL
++    lsi = ClassifierReborn::LSI.new
++    lsi.add_item @str1, 'Dog'
++    lsi.add_item @str2, 'Dog'
++    lsi.add_item @str3, 'Cat'
++    lsi.add_item @str4, 'Cat'
++    lsi.add_item @str5, 'Bird'
++    assert_output(/There are no documents that are similar to penguin/) { lsi.search('penguin') }
++  end
++
++  def test_warn_when_adding_bad_document
++    lsi = ClassifierReborn::LSI.new
++    assert_output(/Input: 'i can' is entirely stopwords or words with 2 or fewer characters. Classifier-Reborn cannot handle this document properly./) { lsi.add_item("i can") }
++  end
++
++  def test_summary
++    assert_equal 'This text involves dogs too [...] This text also involves cats', Summarizer.summary([@str1, @str2, @str3, @str4, @str5].join, 2)
++  end
++end
+diff --git a/test/lsi/word_list_test.rb b/test/lsi/word_list_test.rb
+new file mode 100644
+index 0000000..deffc41
+--- /dev/null
++++ b/test/lsi/word_list_test.rb
+@@ -0,0 +1,33 @@
++require_relative '../test_helper'
++
++class WordListTest < Minitest::Test
++  def test_size_does_not_count_words_twice
++    list = ClassifierReborn::WordList.new
++    assert list.size == 0
++
++    list.add_word('hello')
++    assert list.size == 1
++
++    list.add_word('hello')
++    assert list.size == 1
++
++    list.add_word('world')
++    assert list.size == 2
++  end
++
++  def test_brackets_return_correct_position_based_on_add_order
++    list = ClassifierReborn::WordList.new
++    list.add_word('hello')
++    list.add_word('world')
++    assert list['hello'] == 0
++    assert list['world'] == 1
++  end
++
++  def test_word_for_index_returns_correct_word_based_on_add_order
++    list = ClassifierReborn::WordList.new
++    list.add_word('hello')
++    list.add_word('world')
++    assert list.word_for_index(0) == 'hello'
++    assert list.word_for_index(1) == 'world'
++  end
++end
+diff --git a/test/test_helper.rb b/test/test_helper.rb
+new file mode 100644
+index 0000000..b539406
+--- /dev/null
++++ b/test/test_helper.rb
+@@ -0,0 +1,8 @@
++$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')
++
++require 'minitest/autorun'
++require 'minitest/reporters'
++Minitest::Reporters.use!
++require 'pry'
++require 'classifier-reborn'
++include ClassifierReborn
diff --git a/debian/patches/series b/debian/patches/series
index 074bbd5..ae0454b 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1 +1,2 @@
 0001-Replace-gemspec-git-execution.patch
+0002-Import-test-from-upstream.patch

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ruby-extras/ruby-classifier-reborn.git