[Debtags-devel] First Preview

Thu, 14 Oct 2004 17:08:58 +0200

--=-INbXfaqSIogeBte/tUki
Content-Type: text/plain
Content-Transfer-Encoding: 7bit

Hello Erich

> i tried running your app (replacing Heap:: with Heap::Binary, because
> that one is available as debian package...) but i only run into the
> following error:
Ah.., I will switch to this soon, I really prefer to use debian
packages.

> Can't use an undefined value as an ARRAY reference at
> ./bayesian-tagger.pl line 646, <FILE> line 11175.
Thanks, I hope I have corrected this and attached the new version.
Additionally the -p option does work now, so single packages could be
tested. So after training you can test a single package now like this:
	bayesian-tagger.pl -p debtags uitoolkit::qt

> Did you experiment with choosing only the n most significant words?
> IIRC this is a trick that improved match quality in spam filtering.
> They sort word scores by how far they are off 0.5, and take the top
> 10. Then they multiply these values (i think) to get the final score.
I choose the 15 most significant words. I fully follow the technique
proposed by Paul Graham in his "Plan for Spam"
(http://www.paulgraham.com/spam.html).
Simply mutiplying all scores would nearly result in a good hit as a
single 0.1 would definetely reduce the score to a maximum of 0.1.

> I think you should only consider packages with a score < 0.2 or > 0.8
> to be classified automatically by the bayes tagger. 
> Otherwise the decision is "unknown".
> We will have to handle "unknown" cases anyway.
Well this is no problem, but as I said the naive bayesian approach
results very seldom in scores around 0.5 either it wields something near
0 or near one - this is what is often critizied about it.

Greetings Ben

--=-INbXfaqSIogeBte/tUki
Content-Disposition: attachment; filename=bayesian-tagger.pl
Content-Type: application/x-perl; name=bayesian-tagger.pl
Content-Transfer-Encoding: 7bit

#!/usr/bin/perl -w

# TODO save the count of good and bad for reloading

=head1 DESCRIPTION

=head1 METHODS

=over 4

=cut

use DB_File;
use warnings;
use strict;

use AptPkg::Cache;

# CPAN modules
use Heap::Priority;
use Getopt::Long;
use File::Spec;

Package::init();

my $usage = 
"Usage:
	bayesian-tagger.pl options tagname
	Options:
		--verbose|-v toggle verbose mode, this may be given more than once
		--no-train|-nt do not train, only test (e.g. the database was allready filled)
		--test-package|-p packageName do not test the files but the given package
";

my $noTrain=0;
$main::verbose=0;
my $package;

if ( !GetOptions(
		"verbose|v+" => \$main::verbose,
		"no-train|nt" => \$noTrain,
		"test-package|p=s" => \$package
	) 
	|| (@ARGV != 1) 
)
{
	die $usage;
}
my ($tag) = @ARGV;

my $dirFromTag = $tag;
$dirFromTag  =~ s/:/_/g;	# remove all : from the tag and replace by an _

my $filter = NaiveBayesianFilter->new( Tokenizer->new(), $dirFromTag );

# if a single package shall be tested
if ($package)
{
	my ( $good, $posterior ) = categorizePackage($filter, $package);
	print "Package $package was categorized as ".($good ? "good" : "bad"). " with a ";
	print "posterior to be good of $posterior\n";
}
else
{
	my $badFile = "$dirFromTag/bad.list";
	my $goodFile = "$dirFromTag/good.list";

	my @goodPackages = readPackagesFromFile($goodFile);
	my @badPackages = readPackagesFromFile($badFile);

	my $halfGoodNumber = @goodPackages/2;
	my $halfBadNumber = @badPackages/2;

	my $i=0;
	my $j=0;

	if (!$noTrain)
	{
		for ( ; $i<$halfGoodNumber; ++$i)
		{
			trainPackage($filter, $goodPackages[$i], 1);
		}
		for ( ; $j<$halfBadNumber; ++$j)
		{
			trainPackage($filter, $badPackages[$j], 0);
		}
		$filter->save();
	}

	my $matches = 0;
	my $mismatches = 0;
	my $badMatches = 0;
	my $goodMismatches = 0;

	my $goodTests = scalar(@goodPackages) - $i;
	my $badTests = scalar(@badPackages) - $j;

	for ( ; $i<=$#goodPackages;++$i)
	{

		print("Testing $goodPackages[$i]\n") if ($main::verbose);
		testPackage($filter, $goodPackages[$i], 1);
	}

	for ( ; $j<=$#badPackages;++$j)
	{
		print("Testing $badPackages[$j]\n") if ($main::verbose);
		testPackage($filter, $badPackages[$j], 0);
	}

	print "\n";
	print "Tested packages: ".($goodTests+$badTests)."\n";
	print "Expected to be good: $goodTests\n";
	print "Expected to be bad: $badTests\n";

	print "Matches: $matches ^= " . $matches/($matches+$mismatches) . "\n";
	print "Mismatches: $mismatches ^= " . $mismatches/($matches+$mismatches) . "\n";
	print "Expected good, but wielded bad: $goodMismatches ^= " . $goodMismatches/scalar($goodTests) . "\n";
	print "Expected bad, but wielded good: $badMatches ^= " . $badMatches/scalar($badTests) . "\n";
}

## This trains the given filter with the given package
##
## @param $filter the filter to train
## @param $packageName the name of package to be used for training
## @param $good if the package is good (1) or bad (0)
sub trainPackage()
{
	my ($filter, $packageName, $good) = @_;
	my $package = Package->new($packageName);
	print "Adding $packageName\n" if ($main::verbose);
	if ($good)
	{
		$filter->addGoodEntry($package->getCategorizerText());
	}
	else
	{
		$filter->addBadEntry($package->getCategorizerText());
	}
}

## @brief This tests the package and compares with the given value.
## 
## It complains on STDOUT if the test and the checker do no match.
## In verbose mode it informs the user about matches too.
## @param $filter the filter to use for categorizing
## @param $packageName the name of package to be used for testing
## @param $expected if the package is expected to be good (1) or bad (0)
sub testPackage
{
	my ($filter, $packageName, $expected) = @_;
	if (categorizePackage($filter, $packageName) != $expected)
	{
		++$mismatches;
		if ($expected)
		{
			print "BAD: good package $packageName did not match!\n";
			++$goodMismatches;
		}
		else
		{
			print "BAD: bad package $packageName did match!\n";
			++$badMatches;
		}
	}
	else
	{
		++$matches;
	 	if ($main::verbose)
		{
			if ($expected)
			{
				print "GOOD: good package $packageName matched\n";
			}
			else
			{
				print "GOOD: bad package $packageName did not match\n";
			}
		}
	}
}

## This trains the given filter with the given package
##
## @param $filter the filter to use for categorizing
## @param $packageName the name of package to be used for categorizing
## @returns if the package is good (0) or bad (1)
sub categorizePackage()
{
	my ($filter, $packageName) = @_;
	my $package = Package->new($packageName);
	my ($good, $posterior) = $filter->testEntry($packageName."\n".$package->getCategorizerText());
	print "Categorized $packageName as ".(($good) ? "good\n" : "bad\n") if ($main::verbose>2);
	return ($good, $posterior);
}

## Returns the list of packages in the handed file.
## 
## The file must contain one package per line only.
## @returns the packages as an array
sub readPackagesFromFile
{
	my ($file) = @_;
	open FILE, $file || die "Could not open file $file for reading.";
	my @result;
	while (<FILE>)
	{
		chomp($_);
		push(@result, $_);
	}
	return @result;
}

#foreach (@packages)
#{
#	print $package->getName()."\n".$package->getCategorizerText()."\n";
#	print "-" x 80 . "\n";
#}

#my $tagDB = BayesDB->new($tag);
#$tagDB->printTagName();
#
#$tagDB->addGoodToken("Perl");
#$tagDB->addGoodToken("Perl");
#$tagDB->addGoodToken("Viagra");
#print "Token Perl: ".$tagDB->goodTokenCount("Perl")."\n";
#print "Token Viagra: ".$tagDB->goodTokenCount("Viagra")."\n";
#print "Token test: "."\n" if (defined($test));
#$tagDB->close();

package NaiveBayesianFilter;

sub min
{
	return ($_[0]<$_[1]) ? $_[0] : $_[1];
}

sub max
{
	return ($_[0]>$_[1]) ? $_[0] : $_[1];
}

=item C<new> I<tokenizer>

This creates a bayesian filter, which can be trained and 
asked if a given entry is good or bad. 

B<Parameter>

=over 6

=item tokenizer

the tokenizer to be used

=item directory

The directory where to store the data.

=back

=cut

sub new
{
	shift @_;
	my ($tokenizer, $directory) = @_;
	my $instance = {	tokenizer => $tokenizer ,
							directory => $directory,
							goodEntries => 0,
							badEntries => 0,
							goodDb => TokenDB->new( File::Spec->catdir($directory,"good.db") ),
							badDb => TokenDB->new( File::Spec->catdir($directory,"bad.db") ),
							# from which treshhold an entry should be considered good
							goodThreshold => 0.9,
							# how many words should be considered for categorization
							considerations => 15,
							# Maps the tokens to their likelyhood values
							tokenToLikelihood => {},
						};
	my $countFile = File::Spec->catdir($directory, "countfile");
	if ( -e  $countFile)	# if we loaded an existing database
	{
		open(COUNT_FILE, $countFile) || die "Could not open $countFile for reading\n";;
		$instance->{goodEntries} = <COUNT_FILE>;
		chomp($instance->{goodEntries});
		$instance->{badEntries} = <COUNT_FILE>;
		chomp($instance->{badEntries});
		close(COUNT_FILE);
		print "Loaded $instance->{goodEntries} good and $instance->{badEntries} bad messages.\n" 
			if ($main::verbose);
	}
	bless $instance;
	return $instance;
}

sub save()
{
	my ($this) = @_;
	my $countFile = File::Spec->catdir($this->{directory}, "countfile");
	open(COUNT_FILE, "> $countFile") || die "Could not open $countFile for reading\n";
	print COUNT_FILE "$this->{goodEntries}\n";
	print COUNT_FILE "$this->{badEntries}\n";
	close(COUNT_FILE);
}

=item C<addGoodEntry> I<entry>

Adds the given entry to the good ones.

=cut

sub addGoodEntry
{
	my $this = shift @_;
	my $entry = shift(@_);	# do not copy the string, get a reference
	$this->{goodDb}->addTokens($this->{tokenizer}->tokenize($entry));
	$this->incrementGood();
	$this->{tokenToLikelihood} = {};
}

=item C<addBadEntry> I<entry>

Adds the given entry to the bad ones.

=cut
sub addBadEntry
{
	my $this = shift @_;
	my $entry = shift(@_);	# do not copy the string, get a reference
	$this->{badDb}->addTokens($this->{tokenizer}->tokenize($entry));
	$this->incrementBad();
	$this->{tokenToLikelihood} = {};
}

## Tests if the given entry is good or bad.
##
## @param entry the entry to be tested
## @returns two values: if the entry was good as first, and its $posterior 
## as second value
sub testEntry
{
	my $this = shift @_;
	my $entry = shift(@_);	# do not copy the string, get a reference
	my @tokens = $this->{tokenizer}->tokenize($entry);
	#foreach (@tokens)
	#{
		my $posterior = $this->calculatePosterior(@tokens);
		print "Good posterior: ".$posterior."\n" if ($main::verbose);
	#}
	return ($posterior > $this->{goodThreshold}), $posterior;
}

sub calculatePosterior
{
	my $this = shift @_;
	my @tokens = @_;
	# here the items will be stored, where those with the likelihood
	# with the highest difference to 0.5 will appear first
	my $pQueue = Heap::Priority->new();
	my %set;
	foreach my $token (@tokens)
	{
		# if the token was allready used
		next if ( exists($set{$token}) );
		$set{$token} = 1;	# mark this token as used
		# TODO we can spare the token name here, but for debug purposes we keep them
		my $lh = $this->getLikelihood($token);
		$pQueue->add([$token, $lh], abs(0.5-$lh));	
	}
	# Paul Grahams formula a little optimized for numeric operations looks like this:
	# 1/(1 + (1-a)/a * (1-b)/b * (1-c)/c *...* (1-n)/*n )
	my $factor = 1;
	for (my $i=0; $i < $this->{considerations}; ++$i)
	{
		# last if (!$pQueue->count());	# only available for SPARK
		my $pair = $pQueue->pop();
		last if (!defined($pair));
		my ($token, $lh) = @$pair;
		print "Posterior: ($token, $lh)\n" if ($main::verbose > 1);
		$factor *= (1-$lh)/$lh;
	}
	return 1/(1 + $factor);
}

=item C<getLikelihood> I<token>

This returns the likelihood for the given token.

The results is cached in tokenToLikelihood.

=cut

sub getLikelihood
{
	my ($this, $token) = @_;
	my $lh = $this->{tokenToLikelihood}{$token};
	if ( !defined($lh) )	
	{
		$lh = $this->calculateLikelihood($token);
		$this->{tokenToLikelihood}{$token} = $lh;
	}
	return $lh;
}

sub calculateLikelihood
{
	my ($this, $token) = @_;
	my $goodCount = $this->{goodDb}->tokenCount($token);
	my $badCount = $this->{badDb}->tokenCount($token);
	return 0.4 if ($goodCount == 0 && $badCount == 0);
	my $goodRatio = 0.01;
	my $badRatio = 0.01;
	if ($goodCount)
	{
		$goodRatio = max(0.01, min($goodCount/$this->{goodEntries}, 0.99) );	
	}
	if ($badCount)
	{
		$badRatio  = max(0.01, min( $badCount/$this->{badEntries} , 0.99) );
	}
	return $goodRatio/($goodRatio+$badRatio);	# likelihood
}

## @brief This calculate the prior probability of stuff belonging
## to the good set
## @note Currently this divides the number of good entries
## by the number of bad ones, so your training set should better
## be representative.
## private function
sub calculatePrior()
{
	my ($this) = @_;
	$this->{prior} = $this->{goodEntries} / ($this->{goodEntries} + $this->{badEntries});
}

sub incrementGood()
{
	my ($this) = @_;
	++$this->{goodEntries};
	$this->calculatePrior();
}

sub incrementBad()
{
	my ($this) = @_;
	++$this->{badEntries};
	$this->calculatePrior();
}

package TokenDB;

## This creates a database of simple string tokens.
## 
## @param $filename the name of the file of this database
sub new
{
	shift @_;
	my ($filename) = @_;
	my $instance = {	dbFile => {} ,	# holds the array which is used to store information
#							dbHandle => 0,	# holds a handle to the to the BerkleyDB API object
						};
	bless $instance;
	#$instance->{dbHandle} = 
	tie( %{$instance->{dbFile}}, "DB_File", "$filename");
	print "Loaded ".scalar(keys %{$instance->{dbFile}})." tokens from $filename\n" if ($main::verbose);
	return $instance;
}

## @brief This is the destructor which closes the DB.
sub close
{
	my ($this) = @_;
#	undef $this->{dbHandle};	# allow us to untie dbFile
	untie %{$this->{dbFile}};
}

## @brief Adds a single token to the database.
sub addToken
{
	my ($this, $token) = @_;
	if ($this->{dbFile}{$token})
	{
		++$this->{dbFile}{$token};
	}
	else
	{
		$this->{dbFile}{$token} = 1;
	}
}

## Adds an array of tokens to the database.
sub addTokens
{
	my $this = shift(@_);
	foreach (@_)
	{
		$this->addToken($_);
	}
}

## @brief Returns how often the given token is counted in the
## database.
## 
## @param $token the name of the token to get the information for
##
sub tokenCount
{
	my ($this, $token) = @_;
	my $count = $this->{dbFile}{$token};
	return defined($count) ? $count : 0;
}

package Tokenizer;

## @brief Creates this tokenizer. It is a very simple one, splitting
## only on some special characters.
## 
sub new
{
	shift @_;
	my $instance = {	discards => []	# the words to be discarded from the token
												# stream because they are too common
						};
	bless $instance;
	$instance->initDiscards();
	return $instance;
}

## private function
sub initDiscards
{
	my ($this) = @_;
	my @tmpArray = qw( a also an and are at be by can for from it in is on of the this to with you);
	$this->{discards} = \@tmpArray;
}

## @brief This tokenizes the handed string returning an array of 
## all tokens recognized in order of appearance
## @param $string the string to tokenize
## @returns an array of all tokens found
sub tokenize
{
	my $this = shift @_;
	# currenlty the very simplest thing, splitting by ws,
	# we can dive into greater detail later
	my @tokens = split(/[\s:\.\(\)\*,;]+/ , $_[0]);	# split by everything possible
	my @resultTokens;
	foreach my $token (@tokens)
	{
		$token = lc($token);
		next if (!$token);	# if the token is an empty string TODO can we prevent this with some nifty split parameters?
		# if the token is uninteresting
		# TODO search using binary search here
		next if ( grep( ($_ eq $token), @{$this->{discards}} ) );
		push @resultTokens, $token;
	}
	return @resultTokens;
}

package Package;

my $cache;

sub init
{
	$cache = AptPkg::Cache->new();
}

sub new
{
	shift @_;
	my ($name) = @_;
	my $instance = {	name => $name,
							description => undef,
							depends => undef,
							provides => undef,
							tags => undef,
						};
	bless $instance;
	return $instance;
}

sub getCategorizerText
{
	my ($this) = @_;
	my $text = "$this->{name}\n";
	# depends will be \% and provides \@
	my ($depends, $provides) = $this->getDepends();	
	$text .= join("\n", keys(%$depends))."\n";
	$text .= join("\n", @$provides)."\n";
	$text .= $this->getDescription();
#	print(("-" x 80) . "\n$text\n" . ("-"x 80). "\n");
}

## @brief Returns the description of the package.
##
## This value is lazy initialized, i.e. after being requested
## once, it will be accessible fast (and consume you precious memory).
sub getDescription
{
	my ($this) = @_;
	if ( !defined($this->{description}) )
	{
		my $package = $cache->{$this->{name}};	# currently not used
		# return if (!$package->{Section});
		$this->{description} = $cache->packages()->lookup($this->{name})->{LongDesc};
	}
	return $this->{description};
}

## Returns the name of this package
sub getName
{
	return $_[0]->{name};
}

## Returns the depends for the given package mapped to the type of
## dependencies (Depend, PreDepend,...) as reference and a second reference
## to the provides of the package.
##
## This value is lazy initialized, i.e. after being requested
## once, it will be accessible fast (and consume you precious memory).
sub getDepends
{
	my ($this) = @_;
	if ( !defined($this->{depends}) )
	{
		my %depends = ();
		my @provides = ();
		my $package = $cache->{$this->{name}};
		# returns a reference to an array of Version packages
		my $versionList = $package->{VersionList};
		if ( $versionList && scalar( @$versionList) )
		{
			# take the first entry from the list (which should be the newest version)
			my $version = ${@{$versionList}}[0];
			# iterate over all dependencies 
			# brr... I hate perls oo syntax, you get really mad with it!
			if ($version->{DependsList})
			{
				foreach my $depend ( @{$version->{DependsList}} )
				{
					$depends{$depend->{TargetPkg}->{Name}} = $depend->{DebType};
				}
			}
			if ($version->{ProvidesList})
			{
				foreach my $provide ( @{$version->{ProvidesList}} )
				{
					push @provides, $provide->{Name};
				}
			}
		}
		else
		{	
			print "No information available for package $this->{name}\n" 
				if ($main::verbose);
		}
		$this->{depends} = \%depends;
		$this->{provides} = \@provides;
	}
	return ( $this->{depends}, $this->{provides} );
}

#	my ($patches) = @_;
#	my $recs = $cache->packages();
#	for my $name (keys %$cache)
#	{
#		my $pkg = $cache->{$name};
#
#		# Skip non-packages
#		next if not $pkg->{Section};
#
#		# Infer from regexps matching package name
#		for my $rec (@infer_names_data)
#		{
#			if ($name =~ $rec->[0])
#			{
#				#print "INFER NAME $name: ", join(', ', @{$rec->[1]{add}}), "\n";
#				do_action($patches, $name, $rec->[1]);
#			}
#		}
#
#		# Infer from package section
#		if (exists $infer_section_data{$pkg->{Section}})
#		{
#			my $action = $infer_section_data{$pkg->{Section}};
#			do_action($patches, $name, $action);
#		}
#	
#
#		# Infer from reverse dependencies
#		if (exists $infer_deps_data{$name})
#		{
#			my $action = $infer_deps_data{$name};
#
#			# Iterate reverse dependencies
#			my $revdeps = $pkg->{RevDependsList};
#			for my $r (@$revdeps)
#			{
#				next if $r->{DepType} != 1;
#
#				#print "INFER DEPS ", $r->{ParentPkg}{Name}, ": ", join(', ', @{$action->{add}}), "\n";
#				do_action($patches, $r->{ParentPkg}{Name}, $action);
#			}
#		}
#
#		# TODO: use keywords in descriptions
#		#       works for technologies and protocols
#		# TODO: use libbow to extract keywords from package descriptions
#		# TODO: use popcon data to give a frequency estimate by aggregating values
#		#       in a scale (linear or logaritmic?)
#		# TODO: use debram data
#	}

#end of method list
=back

=cut

__END__

--=-INbXfaqSIogeBte/tUki--