[Debtags-devel] Second Preview
Benjamin Mesing
bensmail@gmx.net
Thu, 14 Oct 2004 20:38:59 +0200
--=-8GVCQIZi1oGXKlKrBXUx
Content-Type: text/plain
Content-Transfer-Encoding: 7bit
Hello,
my deepest apologies for having sent out another buggy version :-(
I will correct this now. Here is yet another bayesian-tagger. I have
also incorporated maintainers and section in this version.
Greetings Ben
--=-8GVCQIZi1oGXKlKrBXUx
Content-Disposition: attachment; filename=bayesian-tagger.pl
Content-Type: application/x-perl; name=bayesian-tagger.pl
Content-Transfer-Encoding: 7bit
#!/usr/bin/perl -w
=head1 DESCRIPTION
=head1 METHODS
=over 4
=cut
use DB_File;
use warnings;
use strict;
use AptPkg::Cache;
# CPAN modules
use Heap::Priority;
use Getopt::Long;
use File::Spec;
Package::init();
my $usage =
"Usage:
bayesian-tagger.pl options tagname
Options:
--verbose|-v toggle verbose mode, this may be given more than once
--no-train|-nt do not train, only test (e.g. the database was allready filled)
--test-package|-p packageName do not test the files but the given package
";
my $noTrain=0;
$main::verbose=0;
my $package;
if ( !GetOptions(
"verbose|v+" => \$main::verbose,
"no-train|nt" => \$noTrain,
"test-package|p=s" => \$package
)
|| (@ARGV != 1)
)
{
die $usage;
}
my ($tag) = @ARGV;
my $dirFromTag = $tag;
$dirFromTag =~ s/:/_/g; # remove all : from the tag and replace by an _
my $filter = NaiveBayesianFilter->new( Tokenizer->new(), $dirFromTag );
my $matches = 0;
my $mismatches = 0;
my $badMatches = 0;
my $goodMismatches = 0;
# if a single package shall be tested
if ($package)
{
my ( $good, $posterior ) = categorizePackage($filter, $package);
print "Package $package was categorized as ".($good ? "good" : "bad"). " with a ";
print "posterior to be good of $posterior\n";
}
else
{
my $badFile = "$dirFromTag/bad.list";
my $goodFile = "$dirFromTag/good.list";
my @goodPackages = readPackagesFromFile($goodFile);
my @badPackages = readPackagesFromFile($badFile);
my $halfGoodNumber = @goodPackages/2;
my $halfBadNumber = @badPackages/2;
my $i=0;
my $j=0;
if (!$noTrain)
{
for ( ; $i<$halfGoodNumber; ++$i)
{
trainPackage($filter, $goodPackages[$i], 1);
}
for ( ; $j<$halfBadNumber; ++$j)
{
trainPackage($filter, $badPackages[$j], 0);
}
$filter->save();
}
my $goodTests = scalar(@goodPackages) - $i;
my $badTests = scalar(@badPackages) - $j;
for ( ; $i<=$#goodPackages;++$i)
{
print("Testing $goodPackages[$i]\n") if ($main::verbose);
testPackage($filter, $goodPackages[$i], 1);
}
for ( ; $j<=$#badPackages;++$j)
{
print("Testing $badPackages[$j]\n") if ($main::verbose);
testPackage($filter, $badPackages[$j], 0);
}
print "\n";
print "Tested packages: ".($goodTests+$badTests)."\n";
print "Expected to be good: $goodTests\n";
print "Expected to be bad: $badTests\n";
print "Matches: $matches ^= " . $matches/($matches+$mismatches) . "\n";
print "Mismatches: $mismatches ^= " . $mismatches/($matches+$mismatches) . "\n";
print "Expected good, but wielded bad: $goodMismatches ^= " . $goodMismatches/scalar($goodTests) . "\n";
print "Expected bad, but wielded good: $badMatches ^= " . $badMatches/scalar($badTests) . "\n";
}
## This trains the given filter with the given package
##
## @param $filter the filter to train
## @param $packageName the name of package to be used for training
## @param $good if the package is good (1) or bad (0)
sub trainPackage()
{
my ($filter, $packageName, $good) = @_;
my $package = Package->new($packageName);
print "Adding $packageName\n" if ($main::verbose);
if ($good)
{
$filter->addGoodEntry($package->getCategorizerText());
}
else
{
$filter->addBadEntry($package->getCategorizerText());
}
}
## @brief This tests the package and compares with the given value.
##
## It complains on STDOUT if the test and the checker do no match.
## In verbose mode it informs the user about matches too.
## @param $filter the filter to use for categorizing
## @param $packageName the name of package to be used for testing
## @param $expected if the package is expected to be good (1) or bad (0)
sub testPackage
{
my ($filter, $packageName, $expected) = @_;
my ($good, $posterior) = categorizePackage($filter, $packageName);
if ( $good != $expected)
{
++$mismatches;
if ($expected)
{
print "BAD: good package $packageName did not match!\n";
++$goodMismatches;
}
else
{
print "BAD: bad package $packageName did match!\n";
++$badMatches;
}
}
else
{
++$matches;
if ($main::verbose)
{
if ($expected)
{
print "GOOD: good package $packageName matched\n";
}
else
{
print "GOOD: bad package $packageName did not match\n";
}
}
}
}
## This trains the given filter with the given package
##
## @param $filter the filter to use for categorizing
## @param $packageName the name of package to be used for categorizing
## @returns if the package is good (0) or bad (1)
sub categorizePackage()
{
my ($filter, $packageName) = @_;
my $package = Package->new($packageName);
my ($good, $posterior) = $filter->testEntry($packageName."\n".$package->getCategorizerText());
print "Categorized $packageName as ".(($good) ? "good\n" : "bad\n") if ($main::verbose>2);
return ($good, $posterior);
}
## Returns the list of packages in the handed file.
##
## The file must contain one package per line only.
## @returns the packages as an array
sub readPackagesFromFile
{
my ($file) = @_;
open FILE, $file || die "Could not open file $file for reading.";
my @result;
while (<FILE>)
{
chomp($_);
push(@result, $_);
}
return @result;
}
#foreach (@packages)
#{
# print $package->getName()."\n".$package->getCategorizerText()."\n";
# print "-" x 80 . "\n";
#}
#my $tagDB = BayesDB->new($tag);
#$tagDB->printTagName();
#
#$tagDB->addGoodToken("Perl");
#$tagDB->addGoodToken("Perl");
#$tagDB->addGoodToken("Viagra");
#print "Token Perl: ".$tagDB->goodTokenCount("Perl")."\n";
#print "Token Viagra: ".$tagDB->goodTokenCount("Viagra")."\n";
#print "Token test: "."\n" if (defined($test));
#$tagDB->close();
package NaiveBayesianFilter;
sub min
{
return ($_[0]<$_[1]) ? $_[0] : $_[1];
}
sub max
{
return ($_[0]>$_[1]) ? $_[0] : $_[1];
}
=item C<new> I<tokenizer>
This creates a bayesian filter, which can be trained and
asked if a given entry is good or bad.
B<Parameter>
=over 6
=item tokenizer
the tokenizer to be used
=item directory
The directory where to store the data.
=back
=cut
sub new
{
shift @_;
my ($tokenizer, $directory) = @_;
my $instance = { tokenizer => $tokenizer ,
directory => $directory,
goodEntries => 0,
badEntries => 0,
goodDb => TokenDB->new( File::Spec->catdir($directory,"good.db") ),
badDb => TokenDB->new( File::Spec->catdir($directory,"bad.db") ),
# from which treshhold an entry should be considered good
goodThreshold => 0.9,
# how many words should be considered for categorization
considerations => 15,
# Maps the tokens to their likelyhood values
tokenToLikelihood => {},
};
my $countFile = File::Spec->catdir($directory, "countfile");
if ( -e $countFile) # if we loaded an existing database
{
open(COUNT_FILE, $countFile) || die "Could not open $countFile for reading\n";;
$instance->{goodEntries} = <COUNT_FILE>;
chomp($instance->{goodEntries});
$instance->{badEntries} = <COUNT_FILE>;
chomp($instance->{badEntries});
close(COUNT_FILE);
print "Loaded $instance->{goodEntries} good and $instance->{badEntries} bad messages.\n"
if ($main::verbose);
}
bless $instance;
return $instance;
}
sub save()
{
my ($this) = @_;
my $countFile = File::Spec->catdir($this->{directory}, "countfile");
open(COUNT_FILE, "> $countFile") || die "Could not open $countFile for reading\n";
print COUNT_FILE "$this->{goodEntries}\n";
print COUNT_FILE "$this->{badEntries}\n";
close(COUNT_FILE);
}
=item C<addGoodEntry> I<entry>
Adds the given entry to the good ones.
=cut
sub addGoodEntry
{
my $this = shift @_;
my $entry = shift(@_); # do not copy the string, get a reference
$this->{goodDb}->addTokens($this->{tokenizer}->tokenize($entry));
$this->incrementGood();
$this->{tokenToLikelihood} = {};
}
=item C<addBadEntry> I<entry>
Adds the given entry to the bad ones.
=cut
sub addBadEntry
{
my $this = shift @_;
my $entry = shift(@_); # do not copy the string, get a reference
$this->{badDb}->addTokens($this->{tokenizer}->tokenize($entry));
$this->incrementBad();
$this->{tokenToLikelihood} = {};
}
## Tests if the given entry is good or bad.
##
## @param entry the entry to be tested
## @returns two values: if the entry was good as first, and its $posterior
## as second value
sub testEntry
{
my $this = shift @_;
my $entry = shift(@_); # do not copy the string, get a reference
my @tokens = $this->{tokenizer}->tokenize($entry);
#foreach (@tokens)
#{
my $posterior = $this->calculatePosterior(@tokens);
print "Good posterior: ".$posterior."\n" if ($main::verbose);
#}
return ($posterior > $this->{goodThreshold}), $posterior;
}
sub calculatePosterior
{
my $this = shift @_;
my @tokens = @_;
# here the items will be stored, where those with the likelihood
# with the highest difference to 0.5 will appear first
my $pQueue = Heap::Priority->new();
my %set;
foreach my $token (@tokens)
{
# if the token was allready used
next if ( exists($set{$token}) );
$set{$token} = 1; # mark this token as used
# TODO we can spare the token name here, but for debug purposes we keep them
my $lh = $this->getLikelihood($token);
$pQueue->add([$token, $lh], abs(0.5-$lh));
}
# Paul Grahams formula a little optimized for numeric operations looks like this:
# 1/(1 + (1-a)/a * (1-b)/b * (1-c)/c *...* (1-n)/*n )
my $factor = 1;
for (my $i=0; $i < $this->{considerations}; ++$i)
{
# last if (!$pQueue->count()); # only available for SPARK
my $pair = $pQueue->pop();
last if (!defined($pair));
my ($token, $lh) = @$pair;
print "Posterior: ($token, $lh)\n" if ($main::verbose > 1);
$factor *= (1-$lh)/$lh;
}
return 1/(1 + $factor);
}
=item C<getLikelihood> I<token>
This returns the likelihood for the given token.
The results is cached in tokenToLikelihood.
=cut
sub getLikelihood
{
my ($this, $token) = @_;
my $lh = $this->{tokenToLikelihood}{$token};
if ( !defined($lh) )
{
$lh = $this->calculateLikelihood($token);
$this->{tokenToLikelihood}{$token} = $lh;
}
return $lh;
}
sub calculateLikelihood
{
my ($this, $token) = @_;
my $goodCount = $this->{goodDb}->tokenCount($token);
my $badCount = $this->{badDb}->tokenCount($token);
return 0.4 if ($goodCount == 0 && $badCount == 0);
my $goodRatio = 0.01;
my $badRatio = 0.01;
if ($goodCount)
{
$goodRatio = max(0.01, min($goodCount/$this->{goodEntries}, 0.99) );
}
if ($badCount)
{
$badRatio = max(0.01, min( $badCount/$this->{badEntries} , 0.99) );
}
return $goodRatio/($goodRatio+$badRatio); # likelihood
}
## @brief This calculate the prior probability of stuff belonging
## to the good set
## @note Currently this divides the number of good entries
## by the number of bad ones, so your training set should better
## be representative.
## private function
sub calculatePrior()
{
my ($this) = @_;
$this->{prior} = $this->{goodEntries} / ($this->{goodEntries} + $this->{badEntries});
}
sub incrementGood()
{
my ($this) = @_;
++$this->{goodEntries};
$this->calculatePrior();
}
sub incrementBad()
{
my ($this) = @_;
++$this->{badEntries};
$this->calculatePrior();
}
package TokenDB;
## This creates a database of simple string tokens.
##
## @param $filename the name of the file of this database
sub new
{
shift @_;
my ($filename) = @_;
my $instance = { dbFile => {} , # holds the array which is used to store information
# dbHandle => 0, # holds a handle to the to the BerkleyDB API object
};
bless $instance;
#$instance->{dbHandle} =
tie( %{$instance->{dbFile}}, "DB_File", "$filename");
print "Loaded ".scalar(keys %{$instance->{dbFile}})." tokens from $filename\n" if ($main::verbose);
return $instance;
}
## @brief This is the destructor which closes the DB.
sub close
{
my ($this) = @_;
# undef $this->{dbHandle}; # allow us to untie dbFile
untie %{$this->{dbFile}};
}
## @brief Adds a single token to the database.
sub addToken
{
my ($this, $token) = @_;
if ($this->{dbFile}{$token})
{
++$this->{dbFile}{$token};
}
else
{
$this->{dbFile}{$token} = 1;
}
}
## Adds an array of tokens to the database.
sub addTokens
{
my $this = shift(@_);
foreach (@_)
{
$this->addToken($_);
}
}
## @brief Returns how often the given token is counted in the
## database.
##
## @param $token the name of the token to get the information for
##
sub tokenCount
{
my ($this, $token) = @_;
my $count = $this->{dbFile}{$token};
return defined($count) ? $count : 0;
}
package Tokenizer;
## @brief Creates this tokenizer. It is a very simple one, splitting
## only on some special characters.
##
sub new
{
shift @_;
my $instance = { discards => [] # the words to be discarded from the token
# stream because they are too common
};
bless $instance;
$instance->initDiscards();
return $instance;
}
## private function
sub initDiscards
{
my ($this) = @_;
my @tmpArray = qw( a also an and are at be by can for from it in is on of the this to with you);
$this->{discards} = \@tmpArray;
}
## @brief This tokenizes the handed string returning an array of
## all tokens recognized in order of appearance
## @param $string the string to tokenize
## @returns an array of all tokens found
sub tokenize
{
my $this = shift @_;
# currenlty the very simplest thing, splitting by ws,
# we can dive into greater detail later
my @tokens = split(/[\s:\.\(\)\*,;]+/ , $_[0]); # split by everything possible
my @resultTokens;
foreach my $token (@tokens)
{
$token = lc($token);
next if (!$token); # if the token is an empty string TODO can we prevent this with some nifty split parameters?
# if the token is uninteresting
# TODO search using binary search here
next if ( grep( ($_ eq $token), @{$this->{discards}} ) );
push @resultTokens, $token;
}
return @resultTokens;
}
package Package;
my $cache;
sub init
{
$cache = AptPkg::Cache->new();
}
sub new
{
shift @_;
my ($name) = @_;
my $instance = { name => $name,
description => undef,
depends => undef,
provides => undef,
tags => undef,
maintainer => undef,
section => undef,
};
bless $instance;
return $instance;
}
sub getCategorizerText
{
my ($this) = @_;
my $text = "$this->{name}\n";
# depends will be \% and provides \@
my ($depends, $provides) = $this->getDepends();
$text .= join("\n", keys(%$depends))."\n";
$text .= join("\n", @$provides)."\n";
$text .= $this->getMaintainer();
$text .= $this->getDescription();
# print(("-" x 80) . "\n$text\n" . ("-"x 80). "\n");
}
## @brief Returns the description of the package.
##
## This value is lazy initialized, i.e. after being requested
## once, it will be accessible fast (and consume you precious memory).
sub getDescription
{
my ($this) = @_;
if ( !defined($this->{description}) )
{
$this->fillPackageInformation();
# return if (!$package->{Section});
}
return $this->{description};
}
## Returns the name of this package
sub getName
{
return $_[0]->{name};
}
## Returns the depends for the given package mapped to the type of
## dependencies (Depend, PreDepend,...) as reference and a second reference
## to the provides of the package.
##
## This value is lazy initialized, i.e. after being requested
## once, it will be accessible fast (and consume you precious memory).
sub getDepends
{
my ($this) = @_;
if ( !defined($this->{depends}) )
{
$this->fillPackageInformation();
}
return ( $this->{depends}, $this->{provides} );
}
## Returns the depends for the given package mapped to the type of
## dependencies (Depend, PreDepend,...) as reference and a second reference
## to the provides of the package.
##
## This value is lazy initialized, i.e. after being requested
## once, it will be accessible fast (and consume you precious memory).
sub getMaintainer
{
my ($this) = @_;
if ( !defined($this->{maintainer}) )
{
$this->fillPackageInformation();
}
return $this->{maintainer};
}
sub getSection
{
my ($this) = @_;
if ( !defined($this->{section}) )
{
$this->fillPackageInformation();
}
return $this->{section};
}
## @brief Fills the package structure with the information from the database.
##
## @post maintainer, description, section, depends and provides are not undef
sub fillPackageInformation
{
my ($this) = @_;
my $pkgRecord = $cache->packages()->lookup($this->{name});
my $package = $cache->{$this->{name}};
{ # fill depends and provides sections
my %depends = ();
my @provides = ();
# returns a reference to an array of Version packages
my $versionList = $package->{VersionList};
if ( $versionList && scalar( @$versionList) )
{
# take the first entry from the list (which should be the newest version)
my $version = ${@{$versionList}}[0];
# iterate over all dependencies
# brr... I hate perls oo syntax, you get really mad with it!
if ($version->{DependsList})
{
foreach my $depend ( @{$version->{DependsList}} )
{
$depends{$depend->{TargetPkg}->{Name}} = $depend->{DebType};
}
}
if ($version->{ProvidesList})
{
foreach my $provide ( @{$version->{ProvidesList}} )
{
push @provides, $provide->{Name};
}
}
}
else
{
print "No information available for package $this->{name}\n"
if ($main::verbose);
}
$this->{depends} = \%depends;
$this->{provides} = \@provides;
}
$this->{maintainer} = ${%{$pkgRecord}}{Maintainer};
$this->{section} = ${%{$pkgRecord}}{Section};
$this->{section} = "" if ( !defined($this->{section}) );
$this->{description} = ${%{$pkgRecord}}{LongDesc};
}
# my ($patches) = @_;
# my $recs = $cache->packages();
# for my $name (keys %$cache)
# {
# my $pkg = $cache->{$name};
#
# # Skip non-packages
# next if not $pkg->{Section};
#
# # Infer from regexps matching package name
# for my $rec (@infer_names_data)
# {
# if ($name =~ $rec->[0])
# {
# #print "INFER NAME $name: ", join(', ', @{$rec->[1]{add}}), "\n";
# do_action($patches, $name, $rec->[1]);
# }
# }
#
# # Infer from package section
# if (exists $infer_section_data{$pkg->{Section}})
# {
# my $action = $infer_section_data{$pkg->{Section}};
# do_action($patches, $name, $action);
# }
#
#
# # Infer from reverse dependencies
# if (exists $infer_deps_data{$name})
# {
# my $action = $infer_deps_data{$name};
#
# # Iterate reverse dependencies
# my $revdeps = $pkg->{RevDependsList};
# for my $r (@$revdeps)
# {
# next if $r->{DepType} != 1;
#
# #print "INFER DEPS ", $r->{ParentPkg}{Name}, ": ", join(', ', @{$action->{add}}), "\n";
# do_action($patches, $r->{ParentPkg}{Name}, $action);
# }
# }
#
# # TODO: use keywords in descriptions
# # works for technologies and protocols
# # TODO: use libbow to extract keywords from package descriptions
# # TODO: use popcon data to give a frequency estimate by aggregating values
# # in a scale (linear or logaritmic?)
# # TODO: use debram data
# }
#end of method list
=back
=cut
__END__
--=-8GVCQIZi1oGXKlKrBXUx--