[DRE-commits] [bsfilter] 01/05: Imported Upstream version 1.0.19
Christian Hofstaedtler
zeha at moszumanska.debian.org
Sun Dec 22 17:44:57 UTC 2013
This is an automated email from the git hooks/post-receive script.
zeha pushed a commit to branch master
in repository bsfilter.
commit cb3939f33227a70cfbb5500eb1461dc1ae5d75d5
Author: Christian Hofstaedtler <zeha at debian.org>
Date: Sun Dec 22 18:21:03 2013 +0100
Imported Upstream version 1.0.19
---
bsfilter/bsfilter | 243 ++++++++++++++++++++++++++++++-------------------
bsfilter/bsfilter.exe | Bin 1654784 -> 0 bytes
bsfilter/bsfilter.exr | 38 --------
bsfilter/bsfilter.exy | 44 ---------
bsfilter/bsfilterw.exe | Bin 1658880 -> 0 bytes
htdocs/index-e.html | 5 +-
htdocs/index.html | 28 +-----
htdocs/mew.html | 22 ++++-
mua/mew6.4/mew.el | 50 ++++++++++
test/test.rb | 19 +++-
10 files changed, 237 insertions(+), 212 deletions(-)
diff --git a/bsfilter/bsfilter b/bsfilter/bsfilter
index a1d6244..e87751f 100755
--- a/bsfilter/bsfilter
+++ b/bsfilter/bsfilter
@@ -1,5 +1,5 @@
#! /usr/bin/env ruby
-## -*-Ruby-*- $Id: bsfilter,v 1.86 2010/11/20 10:07:45 nabeken Exp $
+## -*-Ruby-*- $Id: bsfilter,v 1.87 2013/11/03 10:22:15 nabeken Exp $
## Copyright (C) 2003, 2004, 2005, 2006 NABEYA Kenichi
##
## This program is free software; you can redistribute it and/or modify
@@ -29,9 +29,9 @@ class Bsfilter
end
attr_accessor :token_dbs
- Release = "$Name: release_1_0_17 $".split[1].sub(/\A[^\d]*/, '').gsub(/_/, '.')
+ Release = "$Name: release_1_0_19 $".split[1].sub(/\A[^\d]*/, '').gsub(/_/, '.')
Release.concat("-") if (Release == "")
- Revision = "$Revision: 1.86 $".gsub(/[^\.\d]/, '')
+ Revision = "$Revision: 1.87 $".gsub(/[^\.\d]/, '')
Languages = ["C", "ja"]
Default_Language = "C"
@@ -88,7 +88,7 @@ class Bsfilter
CODESET_GB18030 = "GB18030"
CODESET_UTF8 = "UTF-8"
PATTERN_UTF8 = '[\xe0-\xef][\x80-\xbf][\x80-\xbf][\xe0-\xef][\x80-\xbf][\x80-\xbf]'
- RE_UTF8 = Regexp.new(PATTERN_UTF8, 'n')
+ RE_UTF8 = Regexp.new(PATTERN_UTF8, nil, 'n')
ALL_TAGS = ["html", "head", "title", "meta", "body", "div", "spam",
"h1", "h2", "h3", "h4", "h5", "h6",
@@ -280,37 +280,21 @@ class Bsfilter
end
def latin2ascii(str)
- newstr = str.tr("\x92\x93\x94", "'''")
- newstr.tr!("\xc0-\xc5\xc8-\xcb\xcc-\xcf\xd2-\xd6\xd9-\xdc", "AAAAAAEEEEIIIIOOOOOUUUU")
- newstr.tr!("\xe0-\xe5\xe8-\xeb\xec-\xef\xf2-\xf6\xf9-\xfc", "aaaaaaeeeeiiiiooooouuuu")
+ str.force_encoding('ASCII-8BIT')
+ newstr = str.tr("\x92\x93\x94".force_encoding('ASCII-8BIT'), "'''")
+ newstr.tr!("\xc0-\xc5\xc8-\xcb\xcc-\xcf\xd2-\xd6\xd9-\xdc".force_encoding('ASCII-8BIT'), "AAAAAAEEEEIIIIOOOOOUUUU")
+ newstr.tr!("\xe0-\xe5\xe8-\xeb\xec-\xef\xf2-\xf6\xf9-\xfc".force_encoding('ASCII-8BIT'), "aaaaaaeeeeiiiiooooouuuu")
return newstr
end
- def define_safe_iconv
- def Iconv.safe_iconv(tocode, fromcode, *strs)
- return strs.map do |str|
- array = Array::new
- strs.each do |str|
- str.split(/(\s+)/).each do |word|
- begin
- array.push(Iconv.iconv(tocode, fromcode, word)[0])
- rescue
- array.push(' ')
- end
- end
- end
- array.join
- end
- end
- def Iconv.u2eucjp(str)
- return NKF::nkf('-e -E -X -Z0', (Iconv.safe_iconv(CODESET_EUCJP, CODESET_UTF8, str))[0])
- end
- def Iconv.u2latin(str)
- return (Iconv.safe_iconv(CODESET_LATIN, CODESET_UTF8, str))[0]
- end
- def Iconv.gb180302eucjp(str)
- return (Iconv.safe_iconv(CODESET_EUCJP, CODESET_GB18030, str))[0]
- end
+ def u2eucjp(str)
+ return NKF::nkf('-e -E -X -Z0', str.encode('EUC-JP', 'UTF-8', :undef => :replace, :invalid => :replace))
+ end
+ def u2latin(str)
+ return str.encode('US-ASCII', 'UTF-8', :undef => :replace, :invalid => :replace)
+ end
+ def gb180302eucjp(str)
+ return str.encode('EUC-JP', 'BIG5', :undef => :replace, :invalid => :replace)
end
def open_ro(file)
@@ -477,7 +461,7 @@ EOM
end
def show_new_token(db)
- db.each_ct do |(category, token)|
+ db.each_ct do |category, token|
if (! value(category, token) || (value(category, token) == 0))
@options["message-fh"].printf("new %s %s\n", category, token)
end
@@ -501,7 +485,7 @@ EOM
end
def export(fh)
- each_ct do |(category, token)|
+ each_ct do |category, token|
fh.printf("%s %s %s %g\n", @language, category, token, value(category, token)) if (value(category, token))
end
end
@@ -641,7 +625,7 @@ EOM
def each_ct
@dbm.each_key do |ct|
- (category, token) = ct.split(Regexp.new(MAGIC), 2)
+ (category, token) = ct.force_encoding('ASCII-8BIT').split(Regexp.new(MAGIC), 2)
yield(category, token) if (category && token)
end
end
@@ -917,7 +901,7 @@ EOM
end
end
end
- return nil
+ return [nil, nil]
end
def get_lang_from_buf(buf, html_flag)
@@ -925,11 +909,18 @@ EOM
end
def get_lang(buf, html_flag=false)
- reg_euc = Regexp::compile("[\xa1\xa2-\xa1\xbc\xa4\xa1-\xa4\xf3\xa5\xa1-\xa5\xf6]{4}", nil, 'e') # kana in euc-jp without zenkaku-space
- reg_sjis = Regexp::compile("[\x81\x40-\x81\x5b\x82\x9f-\x82\xf1\x83\x40-\x83\x96]{2}", nil, 's') # kana in shift-jis
- reg_utf8 = Regexp::compile("[\xe3\x80\x80-\xe3\x80\x82\xe3\x81\x81-\xe3\x82\x93\xe3\x82\xa1-\xe3\x83\xb6]{4}", nil, 'u') # kana in utf8
- reg_jis = Regexp::compile("\\x1b\\x24[\\x42\\x40]", nil, 'n') # escape sequence to jisx0208 new and old
- reg_gb18030_possible = Regexp::compile('[\x80-\x9f]', nil, 'n')
+## reg_euc = Regexp::compile("[\xa1\xa2-\xa1\xbc\xa4\xa1-\xa4\xf3\xa5\xa1-\xa5\xf6]{4}", nil, 'e') # kana in euc-jp without zenkaku-space
+ reg_euc = Regexp::compile("[\xa1\xa2-\xa1\xbc\xa4\xa1-\xa4\xf3\xa5\xa1-\xa5\xf6]{4}".force_encoding('EUC-JP'))
+## reg_sjis = Regexp::compile("[\x81\x40-\x81\x5b\x82\x9f-\x82\xf1\x83\x40-\x83\x96]{2}", nil, 's') # kana in shift-jis
+ reg_sjis = Regexp::compile("[\x81\x40-\x81\x5b\x82\x9f-\x82\xf1\x83\x40-\x83\x96]{2}".force_encoding('SHIFT_JIS'))
+
+## reg_utf8 = Regexp::compile("[\xe3\x80\x80-\xe3\x80\x82\xe3\x81\x81-\xe3\x82\x93\xe3\x82\xa1-\xe3\x83\xb6]{4}", nil, 'u') # kana in utf8
+ reg_utf8 = Regexp::compile("[\xe3\x80\x80-\xe3\x80\x82\xe3\x81\x81-\xe3\x82\x93\xe3\x82\xa1-\xe3\x83\xb6]{4}".force_encoding('UTF-8'))
+
+## reg_jis = Regexp::compile("\\x1b\\x24[\\x42\\x40]", nil, 'n') # escape sequence to jisx0208 new and old
+ reg_jis = Regexp::compile("\\x1b\\x24[\\x42\\x40]".force_encoding('ASCII-8BIT'))
+## reg_gb18030_possible = Regexp::compile('[\x80-\x9f]', nil, 'n')
+ reg_gb18030_possible = Regexp::compile('[\x80-\x9f]'.force_encoding('ASCII-8BIT'))
## reg_char_utf8 = Regexp::compile('(^\w+: .*|charset="?)(utf-8)', Regexp::IGNORECASE, 'n')
## reg_cte_bin = Regexp::compile('\Acontent-transfer-encoding\s*:\s*(base64|quoted-printable)', Regexp::IGNORECASE, 'n')
@@ -940,21 +931,24 @@ EOM
if (html_flag)
str = decode_character_reference2u(str)
end
- if (str =~ reg_gb18030_possible)
+ if (str.force_encoding('ASCII-8BIT') =~ reg_gb18030_possible)
gb18030_possible = true
end
- case str.gsub(/\s/, '')
- when reg_utf8
+ str_utf8 = str.encode('UTF-16BE', 'UTF-8', :undef => :replace, :invalid => :replace).encode('UTF-8', 'UTF-16BE', :undef => :replace, :invalid => :replace)
+ str_sjis = str.encode('UTF-16BE', 'SHIFT_JIS', :undef => :replace, :invalid => :replace).encode('SHIFT_JIS', 'UTF-16BE', :undef => :replace, :invalid => :replace)
+ str_euc = str.encode('UTF-16BE', 'EUC-JP', :undef => :replace, :invalid => :replace).encode('EUC-JP', 'UTF-16BE', :undef => :replace, :invalid => :replace)
+
+ if (str_utf8 =~ reg_utf8)
@options["message-fh"].printf("lang ja utf8\n") if (@options["debug"])
return ["ja", "utf8"]
- when reg_jis
+ elsif (str.force_encoding('ASCII-8BIT') =~ reg_jis)
@options["message-fh"].printf("lang ja jis\n") if (@options["debug"])
return ["ja", "jis"]
- when reg_sjis
+ elsif (str_sjis =~ reg_sjis)
@options["message-fh"].printf("lang ja sjis\n") if (@options["debug"])
return ["ja", "sjis"]
- when reg_euc
+ elsif (str_euc =~ reg_euc)
if (gb18030_possible)
@options["message-fh"].printf("lang ja gb18030\n") if (@options["debug"])
return ["ja", "gb18030"]
@@ -1052,10 +1046,27 @@ EOM
@method = Proc::new {|s| block(s)}
when "mecab"
@method = Proc::new {|s| mecab(s)}
+ meishi_euc = "\xcc\xbe\xbb\xec".force_encoding('ASCII-8BIT')
+ meishi_sjis = meishi_euc.encode('SHIFT_JIS', 'EUC-JP').force_encoding('ASCII-8BIT')
+ meishi_utf8 = meishi_euc.encode('UTF-8', 'EUC-JP').force_encoding('ASCII-8BIT')
if (defined?(MeCab::VERSION)) # defined after 0.90
@m = MeCab::Tagger.new("-Ochasen")
+ node = @m.parseToNode('this is a pen')
+ hinshi = node.next.feature.force_encoding('ASCII-8BIT').split(/,/)[0]
+ else
+ @m = MeCab::Tagger.new("-Ochasen")
+ node = @m.parseToNode('this is a pen')
+ hinshi = node.next.getFeature.force_encoding('ASCII-8BIT').split(/,/)[0]
+ end
+ case hinshi
+ when meishi_euc
+ @m_dic_enc = Encoding::EUC_JP
+ when meishi_sjis
+ @m_dic_enc = Encoding::SHIFT_JIS
+ when meishi_utf8
+ @m_dic_enc = Encoding::UTF_8
else
- @m = MeCab::Tagger.new([$0, "-Ochasen"])
+ @m_dic_enc = Encoding::default_external
end
when "chasen"
Chasen.getopt("-F", '%H %m\n', "-j")
@@ -1071,10 +1082,18 @@ EOM
@method.call(str)
end
- Reg_kanji = Regexp::compile("[\xb0\xa1-\xf4\xa4]+", nil, 'e')
- Reg_katakana = Regexp::compile("[\xa1\xbc\xa5\xa1-\xa5\xf6]+", nil, 'e')
- Reg_kanji_katakana = Regexp::compile("[\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]", nil, 'e')
- Reg_not_kanji_katakana = Regexp::compile("[^\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]", nil, 'e')
+## Reg_kanji = Regexp::compile("[\xb0\xa1-\xf4\xa4]+", nil, 'e')
+ Reg_kanji = Regexp::compile("[\xb0\xa1-\xf4\xa4]+".force_encoding('EUC-JP'))
+ Reg_kanji_ASCII_8BIT = Regexp::compile("[\xb0\xa1-\xf4\xa4]+".force_encoding('ASCII-8BIT'))
+## Reg_katakana = Regexp::compile("[\xa1\xbc\xa5\xa1-\xa5\xf6]+", nil, 'e')
+ Reg_katakana = Regexp::compile("[\xa1\xbc\xa5\xa1-\xa5\xf6]+".force_encoding('EUC-JP'))
+## Reg_kanji_katakana = Regexp::compile("[\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]", nil, 'e')
+ Reg_kanji_katakana = Regexp::compile("[\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]".force_encoding('EUC-JP'))
+# Reg_kanji_katakana = Regexp::compile("[\xb0\xa1-\xf4\xa4 \xa1\xbc \xa5\xa1-\xa5\xf6]".force_encoding('ASCII-8BIT'))
+
+## Reg_not_kanji_katakana = Regexp::compile("[^\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]", nil, 'e')
+ Reg_not_kanji_katakana = Regexp::compile("[^\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]".force_encoding('EUC-JP'))
+# Reg_not_kanji_katakana = Regexp::compile("[^\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]".force_encoding('ASCII-8BIT'))
def kakasi(str)
str = str.gsub(/[\x00-\x7f]/, ' ')
@@ -1092,8 +1111,9 @@ EOM
end
def mecab(str)
+ str = str.encode(@m_dic_enc, :invalid => :replace, :undef => :replace, :replace => ' ')
str = str.gsub(/[\x00-\x7f]/, ' ')
- if (str =~ /\A +\z/)
+ if (str.length == 0 || str =~ /\A +\z/)
return []
end
array = Array::new
@@ -1101,20 +1121,29 @@ EOM
while (node &&
(defined?(MeCab::VERSION) || (node.hasNode == 1)))
if defined?(MeCab::VERSION)
- token = node.surface
- hinshi = node.feature.split(/,/)[0]
+ token = node.surface.encode('EUC-JP', @m_dic_enc)
+ hinshi = node.feature.encode('EUC-JP', @m_dic_enc).split(/,/)[0]
else
- token = node.getSurface
- hinshi = node.getFeature.split(/,/)[0]
- end
- ## print token, hinshi, "\n"
- if (hinshi == "\xcc\xbe\xbb\xec")
- if ((token =~ Reg_kanji_katakana) || (token.length > 2))
+ token = node.getSurface.encode('EUC-JP', @m_dic_enc)
+ hinshi = node.getFeature.encode('EUC-JP', @m_dic_enc).split(/,/)[0]
+ end
+ unless (token.valid_encoding?)
+ # Scrub token
+ token = token.each_char.map { |c| (c.valid_encoding?) ? c : "" }.join
+ end
+ case hinshi
+ when "BOS/EOS"
+ # Skip BOS/EOS
+ when "\xb5\xad\xb9\xe6".force_encoding('EUC-JP')
+ # Skip KIGOU
+ when "\xcc\xbe\xbb\xec".force_encoding('EUC-JP')
+ # MEISHI
+ if ((token =~ Reg_kanji_katakana) || (token.bytesize > 2))
array.push(token)
end
else
token.gsub!(Reg_not_kanji_katakana, '')
- if ((token =~ Reg_kanji) || (token.length > 2))
+ if ((token =~ Reg_kanji) || (token.bytesize > 2))
array.push(token)
end
end
@@ -1159,12 +1188,12 @@ EOM
str.scan(Reg_kanji).each do |token|
case token.length
- when 2, 4
+ when 1, 2
tokens.push(token)
else
- l = token.length / 2 - 2
+ l = token.length - 1
for i in (0 .. l)
- tokens.push(token[i * 2, 4])
+ tokens.push(token[i, 2])
end
end
end
@@ -1209,7 +1238,7 @@ EOM
else
decoded_str = encoded_str.unpack("m*").to_s
end
- Iconv.u2eucjp(decoded_str)
+ u2eucjp(decoded_str)
else
""
end
@@ -1218,12 +1247,19 @@ EOM
else
content = latin2ascii(content)
end
+
+ unless (content.valid_encoding?)
+ # Scrub str
+ content = content.each_char.map { |c| (c.valid_encoding?) ? c : "" }.join
+ end
+
content.scan(reg_token).each do |token|
head_db.add_scalar(header, token, 1) if (token.length < 20)
@options["message-fh"].printf("tokenizer %s %s\n", header, token) if (@options["debug"])
end
if (lang == "ja")
@jtokenizer.split(content.gsub(/\s+/, '')).each do |token|
+ token.force_encoding('ASCII-8BIT')
head_db.add_scalar(header, token, 1)
@options["message-fh"].printf("tokenizer %s %s\n", header, token) if (@options["debug"])
end
@@ -1301,11 +1337,11 @@ EOM
end
def i2eucjp(i)
- Iconv.u2eucjp([i].pack("U"))
+ u2eucjp([i].pack("U"))
end
def i2ascii(i)
- latin2ascii(Iconv.u2latin([i].pack("U")))
+ latin2ascii(u2latin([i].pack("U")))
end
def i2u(i)
@@ -1313,8 +1349,9 @@ EOM
end
def decode_character_reference2u(str)
+ reg = Regexp::compile("\&\#(\d{1,5}|x[\da-f]{1,4});".force_encoding('UTF-8'), Regexp::IGNORECASE)
if (@options["utf-8"])
- newstr = str.gsub(/\&\#(\d{1,5}|x[\da-f]{1,4});/i) do
+ newstr = str.gsub(reg) do
hex_or_dec = $1
if (hex_or_dec =~ /^x(.*)/i)
hex_str = $1
@@ -1324,7 +1361,7 @@ EOM
end
end
else
- newstr = str.gsub(/\&\#(\d{1,5}|x[\da-f]{1,4});/i, "")
+ newstr = str.gsub(reg, "")
end
return newstr
end
@@ -1363,6 +1400,11 @@ EOM
reg_token2 = Regexp::compile('\b\d[\d\.]+\d\b|[\w%]+')
reg_noret = Regexp::compile('[\r\n]*\z')
+ unless (str.valid_encoding?)
+ # Scrub str
+ str = str.each_char.map { |c| (c.valid_encoding?) ? c : "" }.join
+ end
+
str.scan(reg_token).each do |token|
if (token =~ reg_url)
token.scan(reg_token2).each do |token2|
@@ -1378,12 +1420,17 @@ EOM
end
if (lang == "ja")
- str.gsub!(Regexp::compile("^[ -\\~]*[\|\>]+", nil, 'e'), '') # delete cite mark
- str.gsub!(Regexp::compile("^[ \\t\xa1\xa1]+", nil, 'e'), '') # delete white space
- str.gsub!(Regexp::compile("(\\r?\\n){2,}", nil, 'e'), ' ') # keep multiple newline as space
- str.gsub!(Regexp::compile("[\\r\\n]+", nil, 'e'), '') # delete newline
+# str.gsub!(Regexp::compile("^[ -\\~]*[\|\>]+", nil, 'e'), '') # delete cite mark
+ str.gsub!(Regexp::compile("^[ -\\~]*[\|\>]+".force_encoding('EUC-JP')), '')
+# str.gsub!(Regexp::compile("^[ \\t\xa1\xa1]+", nil, 'e'), '') # delete white space
+ str.gsub!(Regexp::compile("^[ \\t\xa1\xa1]+".force_encoding('EUC-JP')), '') # delete white space
+# str.gsub!(Regexp::compile("(\\r?\\n){2,}", nil, 'e'), ' ') # keep multiple newline as space
+ str.gsub!(Regexp::compile("(\\r?\\n){2,}".force_encoding('EUC-JP')), ' ') # keep multiple newline as space
+# str.gsub!(Regexp::compile("[\\r\\n]+", nil, 'e'), '') # delete newline
+ str.gsub!(Regexp::compile("[\\r\\n]+".force_encoding('EUC-JP')), '') # delete newline
str.split.each do |s|
@jtokenizer.split(s).each do |token|
+ token.force_encoding('ASCII-8BIT')
body_hash[token] += 1
@options["message-fh"].printf("tokenizer ja %s %s\n", "body", token) if (@options["debug"])
end
@@ -1450,7 +1497,7 @@ EOM
buf = buf.join.gsub(/[\r\n]/, '').unpack("m*")
end
when /quoted-printable/i
- buf.map! {|str| str.unpack("M*").to_s}
+ buf.map! {|str| str.unpack("M*").join}
end
lang_backup = lang
@@ -1469,13 +1516,13 @@ EOM
if (lang == "ja")
if (code == "utf8")
if (@options["utf-8"])
- str = Iconv.u2eucjp(str)
+ str = u2eucjp(str)
else
lang = Default_Language # can't use iconv / stop ja tokenizer
end
elsif (code == "gb18030")
if (@options["utf-8"])
- str = Iconv.gb180302eucjp(str)
+ str = gb180302eucjp(str)
else
lang = Default_Language
end
@@ -1489,10 +1536,12 @@ EOM
tags = Array::new
if (headers["content-type"] =~ /html/i)
# remove salad at head of part
+ encoding = str.encoding
+ str.force_encoding('ASCII-8BIT')
if (str =~ Regexp::compile('\A[^<>]*?(<(\?xml|!doctype|html|body)\b.*)\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n'))
str = $1
end
-
+
# remove salad in head, except style
if (str =~ /\A(.*?)(<body.*)\z/im)
before_body_tag = $1
@@ -1513,8 +1562,7 @@ EOM
str = $1
end
end
-
-
+
# remove salad after body or html
if (str =~ Regexp::compile('\A(.*)</html>[^<>]*?\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n'))
str = $1
@@ -1525,15 +1573,20 @@ EOM
str.gsub!(Regexp::compile('<[^>]*>', Regexp::MULTILINE, 'n')) do |t|
t = t.gsub(/\n/, '')
if (t =~ RE_ALL_TAGS) # end tags are thrown away
+ t.force_encoding(encoding)
tags.push(t)
end
+ t.force_encoding('ASCII-8BIT')
if (t =~ RE_SPACE_TAGS)
+ t.force_encoding(encoding)
" "
else
+ t.force_encoding(encoding)
""
end
end
+ str.force_encoding(encoding)
body_str = decode_character_reference(str, lang) # out of tags
tag_str = decode_character_reference(tags.join, lang) # in tags
else # if plain text
@@ -1621,7 +1674,7 @@ EOM
def get_combined_probability(token_db)
prob_db = TokenDB::new # temporary
- token_db.each_ct do |(category, token)|
+ token_db.each_ct do |category, token|
probability = @prob.value_with_degene(category, token)
if (probability)
prob_db.set_scalar(category, token, probability)
@@ -1822,7 +1875,7 @@ EOM
count = 0
pminus = FLOAT::new(1)
qminus = FLOAT::new(1)
- token_db.each_ct do |(category, token)|
+ token_db.each_ct do |category, token|
probability = @prob.value_with_degene(category, token) || robx
if ((probability - @center).abs > @min_dev)
if (probability <= 0.0)
@@ -2342,7 +2395,7 @@ EOM
dbs.push(db)
if (@options["pipe"])
insert_headers!(buf, (@options["add-spam"] || @options["sub-clean"]), nil)
- @options["pipe-fh"].print buf
+ @options["pipe-fh"].print buf.join
end
update_token_db_one(db)
end
@@ -2450,11 +2503,10 @@ EOM
end
end
if (@options["imap-fetch-unflagged"])
- null = imap.uid_search(["HEADER", x_spam_flag.sub(/:$/, ''), ""])
yes = imap.uid_search(["HEADER", x_spam_flag.sub(/:$/, ''), "Yes"])
no = imap.uid_search(["HEADER", x_spam_flag.sub(/:$/, ''), "No"])
- @options["message-fh"].printf("imap-fetch-unflagged working original %d null %d Yes %d No %d\n",
- uids.length, null.length, yes.length, no.length) if (@options["verbose"])
+ @options["message-fh"].printf("imap-fetch-unflagged working original %d Yes %d No %d\n",
+ uids.length, yes.length, no.length) if (@options["verbose"])
## uids = uids - imap.uid_search(["HEADER", x_spam_flag.sub(/:$/, ''), ""])
## Sendmail Advanced Message Server returns all mails when search string is zero-length ???
uids = uids - yes - no
@@ -2527,6 +2579,7 @@ EOM
def socket_send_rec(command, socket)
buf = Array::new
+ b = ".\r\n"
if (command)
@options["message-fh"].printf("send %s %s", socket, command.sub(/\APASS.*/i, "PASS ********")) if (@options["debug"])
socket.write_timeout(command) # pass command to pop-server
@@ -2602,7 +2655,7 @@ EOM
end
def pop_proxy_one(pop_server, pop_port, pop_proxy_if, pop_proxy_port, pop_user)
- gs = TCPserver.open(pop_proxy_if, pop_proxy_port)
+ gs = TCPServer.open(pop_proxy_if, pop_proxy_port)
addr = gs.addr
addr.shift
@options["message-fh"].printf("pop_proxy is on %s\n", addr.join(":")) if (@options["verbose"])
@@ -2612,7 +2665,7 @@ EOM
begin
pop_socket = nil
timeout(SOCKET_TIMEOUT) do
- pop_socket = TCPsocket.open(pop_server, pop_port)
+ pop_socket = TCPSocket.open(pop_server, pop_port)
end
@options["message-fh"].print(pop_socket, " is connected\n") if (@options["verbose"])
@@ -2653,7 +2706,7 @@ EOM
# don't use elsif
if (command =~ /QUIT/i)
@options["message-fh"].printf("send %s %s", pop_proxy_socket, response[0]) if (@options["debug"])
- pop_proxy_socket.write(response) # return response to MUA
+ pop_proxy_socket.write(response.join) # return response to MUA
break
elsif ((command =~ /\AUSER\s*(\S*)\r/) &&
(pop_user && pop_user != $1))
@@ -2662,7 +2715,7 @@ EOM
break
else
@options["message-fh"].printf("send %s %s", pop_proxy_socket, response[0]) if (@options["debug"])
- pop_proxy_socket.write(response) # return response to MUA
+ pop_proxy_socket.write(response.join) # return response to MUA
end
end
rescue TimeoutError
@@ -2851,7 +2904,9 @@ EOM
if (s == nil)
raise "socket.gets returned nil"
else
- return s
+ return s.force_encoding('ASCII-8BIT')
+## return s.force_encoding('US-ASCII')
+## return s.force_encoding('US-ASCII')
end
end
end
@@ -3188,10 +3243,8 @@ EOM
options["imap-auth"] = options["imap-auth"] || Default_imap_auth
options["imap-auth-preference"] = Default_imap_auth_preference # can't modify with command line option
- if ((! options["disable-utf-8"]) &&
- safe_require("iconv"))
+ if ((! options["disable-utf-8"]))
options["utf-8"] = true
- define_safe_iconv if (! defined?(Iconv.safe_iconv))
else
options["utf-8"] = false
end
@@ -3401,7 +3454,7 @@ EOM
@db_hash[token_db.language].get_combined_probability(token_db)
insert_headers!(buf, token_db.spam_flag, token_db.probability)
if (@options["pipe"])
- @options["pipe-fh"].print buf
+ @options["pipe-fh"].print buf.join
end
printf("%s\n", file) if (token_db.spam_flag && @options["list-spam"])
printf("%s\n", file) if (! token_db.spam_flag && @options["list-clean"])
diff --git a/bsfilter/bsfilter.exe b/bsfilter/bsfilter.exe
deleted file mode 100755
index fd5fb80..0000000
Binary files a/bsfilter/bsfilter.exe and /dev/null differ
diff --git a/bsfilter/bsfilter.exr b/bsfilter/bsfilter.exr
deleted file mode 100644
index 5d4e85a..0000000
--- a/bsfilter/bsfilter.exr
+++ /dev/null
@@ -1,38 +0,0 @@
-## $Id: bsfilter.exr,v 1.3 2005/03/13 05:44:24 nabeken Exp $
-## recipe for Exerb 3.2.0
-## need to specify --corefile and --outfile
-
-set_kcode none
-add_ruby_script bsfilter
-add_ruby_script getoptlong.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/getoptlong.rb
-add_ruby_script timeout.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/timeout.rb
-add_ruby_script monitor.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/monitor.rb
-add_ruby_script net/imap.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/net/imap.rb
-add_ruby_script openssl.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/openssl.rb
-add_ruby_script openssl/bn.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/openssl/bn.rb
-add_ruby_script openssl/buffering.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/openssl/buffering.rb
-add_ruby_script openssl/cipher.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/openssl/cipher.rb
-add_ruby_script openssl/digest.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/openssl/digest.rb
-add_ruby_script openssl/ssl.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/openssl/ssl.rb
-add_ruby_script openssl/x509.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/openssl/x509.rb
-
-add_extension_library nkf.so /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/i386-mswin32/nkf.so
-add_extension_library iconv.so /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/i386-mswin32/iconv.so
-add_extension_library socket.so /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/i386-mswin32/socket.so
-add_extension_library openssl.so /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/i386-mswin32/openssl.so
-add_extension_library sdbm.so /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/i386-mswin32/sdbm.so
-add_extension_library gdbm.so /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/i386-mswin32/gdbm.so
-add_extension_library digest.so /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/i386-mswin32/digest.so
-add_extension_library digest/md5.so /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/i386-mswin32/digest/md5.so
-
-add_extension_library Win32API.so /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/i386-mswin32/Win32API.so
-add_extension_library swin.so /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/i386-mswin32/swin.so
-add_ruby_script vr/compat/rubycompat.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/vr/compat/rubycompat.rb
-add_ruby_script vr/compat/vrcontrol.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/vr/compat/vrcontrol.rb
-add_ruby_script vr/contrib/vrwincomponent.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/vr/contrib/vrwincomponent.rb
-add_ruby_script vr/rscutil.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/vr/rscutil.rb
-add_ruby_script vr/sysmod.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/vr/sysmod.rb
-add_ruby_script vr/vrcontrol.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/vr/vrcontrol.rb
-add_ruby_script vr/vrtray.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/vr/vrtray.rb
-add_ruby_script vr/vruby.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/vr/vruby.rb
-add_ruby_script vr/winconst.rb /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/vr/winconst.rb
diff --git a/bsfilter/bsfilter.exy b/bsfilter/bsfilter.exy
deleted file mode 100644
index 9413d29..0000000
--- a/bsfilter/bsfilter.exy
+++ /dev/null
@@ -1,44 +0,0 @@
-## $Id: bsfilter.exy,v 1.1 2006/12/10 05:08:43 nabeken Exp $
-## recipe for Exerb 4.0.0
-## need to specify --corefile and --outfile
-
-general:
- startup: bsfilter
- kcode: none
-
-path:
- - /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8
- - /usr/local/lib/ruby/site_ruby/1.8/exerb/1.8/i386-mswin32
-
-file:
- bsfilter:
- getoptlong.rb:
- timeout.rb:
- monitor.rb:
- net/imap.rb:
- openssl.rb:
- openssl/bn.rb:
- openssl/buffering.rb:
- openssl/cipher.rb:
- openssl/digest.rb:
- openssl/ssl.rb:
- openssl/x509.rb:
- nkf.so:
- iconv.so:
- socket.so:
- openssl.so:
- sdbm.so:
- gdbm.so:
- digest.so:
- digest/md5.so:
- Win32API.so:
- swin.so:
- vr/compat/rubycompat.rb:
- vr/compat/vrcontrol.rb:
- vr/contrib/vrwincomponent.rb:
- vr/rscutil.rb:
- vr/sysmod.rb:
- vr/vrcontrol.rb:
- vr/vrtray.rb:
- vr/vruby.rb:
- vr/winconst.rb:
diff --git a/bsfilter/bsfilterw.exe b/bsfilter/bsfilterw.exe
deleted file mode 100755
index 413d91b..0000000
Binary files a/bsfilter/bsfilterw.exe and /dev/null differ
diff --git a/htdocs/index-e.html b/htdocs/index-e.html
index b8d52b3..52e8a8f 100644
--- a/htdocs/index-e.html
+++ b/htdocs/index-e.html
@@ -12,7 +12,7 @@
</head>
<body>
-<p class="version">$Id: index-e.html,v 1.22 2008/03/02 13:04:39 nabeken Exp $</p>
+<p class="version">$Id: index-e.html,v 1.23 2013/11/03 08:40:41 nabeken Exp $</p>
<h1>bsfilter / bayesian spam filter</h1>
<p class="icon">
@@ -67,9 +67,6 @@
<h3>2.1. UNIX</h3>
<p>Install ruby interpreter. Put bsfilter/bsfilter at a directory in your executable path.
On some OSs or distributions, you may use a package like ports or ebild.</p>
-<h3>2.2. Windows</h3>
-<p>Install iconv.dll. Put bsfilter/bsfilter.exe and/or bsfilter/bsfilterw.exe at an appropriate directory.
-using bsfilter.exe is recommended on command-prompt.</p>
<h2><a id="concept">3. How bsfilter works?</a></h2>
<h3>3.1. using spam proability of each token</h3>
diff --git a/htdocs/index.html b/htdocs/index.html
index 0608fa1..712a3c2 100644
--- a/htdocs/index.html
+++ b/htdocs/index.html
@@ -11,7 +11,7 @@
</head>
<body>
-<p class="version">$Id: index.html,v 1.40 2008/03/02 13:04:39 nabeken Exp $</p>
+<p class="version">$Id: index.html,v 1.41 2013/11/03 08:40:41 nabeken Exp $</p>
<h1>bsfilter / bayesian spam filter / �٥������� ���ѥ� �ե��륿</h1>
<p class="icon">
@@ -69,9 +69,6 @@
<h3>2.1. UNIX�Ϥξ��</h3>
<p>ruby�����ץ�ȡ��뤷�����������֤����bsfilter/bsfilter��PATH���̤äƤ���Ŭ���ʥǥ��쥯�ȥ���֤���
OS���ǥ����ȥ�ӥ塼�����ˤ�äƤϡ�ports��ebuild���Υѥå��������Ѱդ���Ƥ�����⤢�롣</p>
-<h3>2.2. Windows��</h3>
-<p>iconv.dll�ȡ��뤷�����������֤����bsfilter/bsfilter.exe, bsfilter/bsfilterw.exe��Ŭ���ʥǥ��쥯�ȥ���֤���
-���ޥ�ɥץ��ץȤ����bsfilter.exe������ʳ������bsfilterw.exe����Ѥ���Τ������ᡣ</p>
<h2><a id="concept">3. ���ä��ꡢ�ɤ��ʤäƤ����?</a></h2>
<h3>3.1. ñ��(token)��spam��Ψ����Ƚ�ꤹ��</h3>
@@ -516,28 +513,7 @@ taro at imap.example.com> list
</pre>
<h3>Q. windows�ǻȤ������Τ�����?</h3>
-<p>mswin32�ǡ�cygwin����������Ruby�����ȡ���Ѥߤξ��ϡ�bsfilter�Υ����������̤˼¹Ԥ���Ф褤��
-Ruby�����ȡ��뤵��Ƥ��ʤ����ϡ�bsfilter.exe��bsfilterw.exe����Ѥ���Τ���ñ��iconv.dll������ɬ�ס�</p>
-<ul>
-<li>���ޥ�ɥץ��ץȤǡ��ǡ����١����δ�������Ԥ����ϡ�bsfilter.exe�������ᡣ</li>
-<li>�������ȥ��åפ�pop proxy�Ȥ���ư�������ϡ�bsfilterw.exe�������ᡣ
---verbose, --debug, --show-new-token������å���������Ϥ��륪�ץ����ϻ��ѤǤ��ʤ���
-���ץ������ʤ��ǵ�ư����ȡ�ɸ�����Ϥ�������ɤ⤦�Ȥ��ƥ��顼����Τϻ��͡�</li>
-</ul>
-<p><a href="http://nadmin.org/howto/bsfilter-on-windows.html">bsfilter������(Windows��)</a>���ܤ�����</p>
-
-<h3>Q. �ɤ�����iconv.dll�����ꤹ��Ф褤?</h3>
-<p><a href="http://www.rubyist.net/~nobu/t/20051108.html#p01">�ʤ��������Ĵ��</a>�ˤ��ȡ�
-ruby���Τ�iconv.dll���ȹ礻�����¤����롣�����Ȥ��Ƥ���ruby���Ф��Ƥϰʲ��Τ褦�ˤʤ�Ϥ���</p>
-<dl>
-<dt><a href="http://www.kaoriya.net/#LIBICONV">KaoriYa.Net</a>�����ۤ���Ƥ���Libiconv DLL 1.9.1 for Windows</dt>
-<dd>OK</dd>
-<dt><a href="http://www.kaoriya.net/#LIBICONV">KaoriYa.Net</a>�����ۤ���Ƥ���Libiconv DLL 1.10-20051016 for Windows</dt>
-<dd>NG�����ޤ�"[BUG] rb_sys_fail(iconv) - errno == 0"�Ȥ�����å�����������뤫��</dd>
-<dt><a href="http://sylpheed.good-day.net/">Sylpheed</a>�ǻ��Ѥ��Ƥ�����</dt>
-<dd>OK��Win32�Ǥ�Ʊ������Ƥ����Τ䡢<a href="http://sylpheed.good-day.net/sylpheed/win32/">http://sylpheed.good-day.net/sylpheed/win32/</a>
-��libiconv���</dd>
-</dl>
+<p>mswin32�ǡ�cygwin����������Ruby�����ȡ���Ѥߤξ��ϡ�bsfilter�Υ����������̤˼¹Ԥ���Ф褤��</p>
<h3>Q. exit status�ϤɤΤ褦�ˤʤäƤ���?</h3>
<p>�̾��0��--pipe���ץ����ʤ���Ƚ���оݥ���ɸ�����Ϥ���Ϳ�������Τߡ�Ƚ���̤�exit status�������뤳�Ȥ�����롣
diff --git a/htdocs/mew.html b/htdocs/mew.html
index 2bfafdc..ef8d58c 100644
--- a/htdocs/mew.html
+++ b/htdocs/mew.html
@@ -11,7 +11,7 @@
</head>
<body>
-<p class="version">$Id: mew.html,v 1.5 2006/01/14 07:37:38 nabeken Exp $</p>
+<p class="version">$Id: mew.html,v 1.7 2012/06/17 06:57:11 nabeken Exp $</p>
<h1>bsfilter with mew</h1>
<p class="icon">
@@ -24,6 +24,26 @@
</p>
<p><a href="index.html">index</a></p>
+
+<h2>bsfilter��mew version 5�ʹߤ���Ȥ�</h2>
+<p>bsfilter����Ѥ���褦��mew��spam�ط��������Ԥ���</p>
+
+<h3>���ȡ���</h3>
+<ul>
+<li>mew�ΥС������˹�碌��mua/mew{5, 6, 6.4}/mew.el��~/.emacs.el�ʤɤ��ɲä��롣</li>
+</ul>
+
+<h3>�Ȥ���</h3>
+<p>mew����å��������������ݡ�����MTA, POP proxy���ˤ��X-Spam-Flag�إå����դ��Ƥ�����ˤϡ�spam�˼�ưŪ��"D"�ޡ������դ���</p>
+<p>summary�⡼�ɤǰʲ������ѤǤ��롣</p>
+<dl>
+<dt>lh</dt><dd>learn-ham�����ߤΥ�å�������clean�Ȥ��Ƴؽ����롣</dd>
+<dt>ls</dt><dd>learn-spam�����ߤΥ�å�������spam�Ȥ��Ƴؽ����롣</dd>
+<dt>lm</dt><dd>��������Υᥤ���ؽ���Ƚ�ꡣspam�ˤ�"*"�ޡ������դ���(mew 6.4�ʹ�)</dd>
+<dt>bm</dt><dd>��������Υᥤ���ؽ���Ƚ�ꡣspam�ˤ�"*"�ޡ������դ���(mew 6.3����)</dd>
+</dl>
+
+
<h2>bsfilter��mew version 4����Ȥ�</h2>
<p>bsfilter����Ѥ���褦��mew��spam�ط��������Ԥ���</p>
diff --git a/mua/mew6.4/mew.el b/mua/mew6.4/mew.el
new file mode 100644
index 0000000..9c2df24
--- /dev/null
+++ b/mua/mew6.4/mew.el
@@ -0,0 +1,50 @@
+;; $Id: mew.el,v 1.2 2012/06/17 06:57:31 nabeken Exp $
+
+; moved from .emacs.el
+(setq mew-spam: "X-Spam-Flag:")
+
+; put "D"
+(defun mew-spam-bsfilter (val)
+ (let ((case-fold-search t))
+ (if (string-match "yes" val) ?D)))
+
+; put "o +sapm" at inc
+;(defun mew-spam-bsfilter (val)
+; (let ((case-fold-search t))
+; (if (string-match "yes" val) "+spam")))
+
+(setq mew-inbox-action-alist
+ '(("X-Spam-Flag:" mew-spam-bsfilter)))
+
+; for "ls" (learn-spam)
+(setq mew-spam-prog "bsfilter")
+(setq mew-spam-prog-args '("-C" "-s" "-u"))
+
+; for "lh" (learn-ham)
+(setq mew-ham-prog "bsfilter")
+(setq mew-ham-prog-args '("-c" "-S" "-u"))
+
+; for "lm" (mark-spam)
+(define-key mew-summary-mode-map "lm" 'mew-summary-bsfilter-mark-region)
+
+(defun mew-summary-bsfilter-mark-region (&optional arg)
+ "study/judge the region and put the '*' mark onto spams.
+need to re-learn if judgment of bsfilter is wrong"
+ (interactive "P")
+ (mew-pickable
+ (mew-summary-with-mewl
+ (let* ((folder (mew-summary-physical-folder))
+ (msgs (mew-summary-pick-msgs folder t))
+ (prog "bsfilter")
+ (opts '("-a" "--list-spam"))
+ (pattern nil))
+ (setq msgs (mew-summary-pick-with-grep prog opts pattern folder msgs))
+ (mew-summary-pick-ls folder msgs)))))
+
+; show X-Spam-Flag and X-Spam-Probability in message buffer
+(setq mew-field-spec
+ (reverse (append (list (car (reverse mew-field-spec)))
+ '(("^X-Spam-Probability:$" t)
+ ("^X-Spam-Flag:$" t))
+ (cdr (reverse mew-field-spec)))))
+
diff --git a/test/test.rb b/test/test.rb
index 47f10fa..d720d5b 100644
--- a/test/test.rb
+++ b/test/test.rb
@@ -1,5 +1,5 @@
-## -*-Ruby-*- $Id: test.rb,v 1.9 2007/02/12 06:01:44 nabeken Exp $
-## this file is written in eucJP
+# -*- coding: euc-jp -*-
+# -*-Ruby-*- $Id: test.rb,v 1.13 2013/11/03 08:26:42 nabeken Exp $
load '../bsfilter/bsfilter'
require 'test/unit'
@@ -16,6 +16,8 @@ class DummyFH
end
def print(*arg)
@buf.push(*arg.flatten.dup)
+ @buf.map{|str| str.force_encoding('ASCII-8BIT')}
+ @buf = @buf.join.split(/(\r\n|\r|\n)/).each_slice(2).to_a.map{|s| s.join}
end
def printf(format, *args)
@buf.push(sprintf(format, *args))
@@ -30,7 +32,11 @@ class Bsfilter
end
def grep_message(pattern)
- options["message-fh"].buf.grep(pattern)
+ if RUBY_VERSION < "1.9"
+ options["message-fh"].buf.grep(pattern)
+ else
+ options["message-fh"].buf.map{|str| str.force_encoding('EUC-JP')}.grep(pattern)
+ end
end
def count_message(pattern)
@@ -38,7 +44,11 @@ class Bsfilter
end
def grep_pipe(pattern)
- options["pipe-fh"].buf.grep(pattern)
+ if RUBY_VERSION < "1.9"
+ options["pipe-fh"].buf.grep(pattern)
+ else
+ options["pipe-fh"].buf.map{|str| str.force_encoding('EUC-JP')}.grep(pattern)
+ end
end
def count_pipe(pattern)
@@ -202,6 +212,7 @@ class TestMultipleInstances < Test::Unit::TestCase
def test_by_jtokenizer
return if (! safe_require('MeCab'))
+ return if (! safe_require('chasen.o'))
@files = ["testcases/iso_2022_jp_plain"]
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ruby-extras/bsfilter.git
More information about the Pkg-ruby-extras-commits
mailing list