r44447 - in /trunk/libparse-mediawikidump-perl: ./ debian/ lib/Parse/ lib/Parse/MediaWikiDump/ t/
jawnsy-guest at users.alioth.debian.org
jawnsy-guest at users.alioth.debian.org
Sun Sep 20 19:31:28 UTC 2009
Author: jawnsy-guest
Date: Sun Sep 20 19:31:22 2009
New Revision: 44447
URL: http://svn.debian.org/wsvn/pkg-perl/?sc=1&rev=44447
Log:
+ Pages is now a subclass of Revisions
+ Fix a memory leak in Pages and Revisions
+ Now uses Test::Exception
Added:
trunk/libparse-mediawikidump-perl/t/pages-single-revision-only.t
- copied unchanged from r44446, branches/upstream/libparse-mediawikidump-perl/current/t/pages-single-revision-only.t
Modified:
trunk/libparse-mediawikidump-perl/Changes
trunk/libparse-mediawikidump-perl/MANIFEST
trunk/libparse-mediawikidump-perl/META.yml
trunk/libparse-mediawikidump-perl/Makefile.PL
trunk/libparse-mediawikidump-perl/TODO
trunk/libparse-mediawikidump-perl/debian/changelog
trunk/libparse-mediawikidump-perl/debian/control
trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump.pm
trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/CategoryLinks.pm
trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Links.pm
trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Pages.pm
trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Revisions.pm
trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/XML.pm
trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/category_link.pm
trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/link.pm
trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/page.pm
trunk/libparse-mediawikidump-perl/t/pages.t
trunk/libparse-mediawikidump-perl/t/pages_test.xml
Modified: trunk/libparse-mediawikidump-perl/Changes
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/Changes?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/Changes (original)
+++ trunk/libparse-mediawikidump-perl/Changes Sun Sep 20 19:31:22 2009
@@ -1,4 +1,8 @@
Revision history for Parse-MediaWikiDump
+
+0.93 Sep 15, 2009
+ * Made ::Pages a subclass of ::Revisions
+ * Discovered a bug regression: ::Pages and ::Revisions leak memory/are not properly garbage collected
0.92 Apr 15, 2009
* Completed documentation for all modules
Modified: trunk/libparse-mediawikidump-perl/MANIFEST
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/MANIFEST?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/MANIFEST (original)
+++ trunk/libparse-mediawikidump-perl/MANIFEST Sun Sep 20 19:31:22 2009
@@ -14,6 +14,7 @@
t/revisions_test.xml
t/revisions.t
t/pre-factory.t
+t/pages-single-revision-only.t
lib/Parse/MediaWikiDump/category_link.pm
lib/Parse/MediaWikiDump/CategoryLinks.pm
lib/Parse/MediaWikiDump/link.pm
Modified: trunk/libparse-mediawikidump-perl/META.yml
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/META.yml?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/META.yml (original)
+++ trunk/libparse-mediawikidump-perl/META.yml Sun Sep 20 19:31:22 2009
@@ -1,6 +1,6 @@
--- #YAML:1.0
name: Parse-MediaWikiDump
-version: 0.92
+version: 0.93
abstract: Tools to process MediaWiki dump files
author:
- Tyler Riddle <triddle at gmail.com>
@@ -11,6 +11,7 @@
requires:
List::Util: 0
Object::Destroyer: 0
+ Test::Exception: 0
Test::More: 0
XML::Parser: 0
no_index:
Modified: trunk/libparse-mediawikidump-perl/Makefile.PL
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/Makefile.PL?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/Makefile.PL (original)
+++ trunk/libparse-mediawikidump-perl/Makefile.PL Sun Sep 20 19:31:22 2009
@@ -14,6 +14,7 @@
'XML::Parser' => 0,
'List::Util' => 0,
'Object::Destroyer' => 0,
+ 'Test::Exception' => 0,
},
dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', },
clean => { FILES => 'Parse-MediaWikiDump-*' },
Modified: trunk/libparse-mediawikidump-perl/TODO
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/TODO?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/TODO (original)
+++ trunk/libparse-mediawikidump-perl/TODO Sun Sep 20 19:31:22 2009
@@ -1,6 +1,3 @@
- * Make tests for Pages as thorough as tests for Revisions
- * Update revisions to make it easier to subclass and change the behavior
- * Port Pages over to being a sucblass of Revisions that enforces only a single page per article
- revision
+ * Fix memory leak bug
* Investigate if using pop for removing parsed items from the buffer will make Pages and Revisions faster;
if so, add an option for such
Modified: trunk/libparse-mediawikidump-perl/debian/changelog
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/debian/changelog?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/debian/changelog (original)
+++ trunk/libparse-mediawikidump-perl/debian/changelog Sun Sep 20 19:31:22 2009
@@ -1,11 +1,10 @@
-libparse-mediawikidump-perl (0.92-1) UNRELEASED; urgency=low
-
- No release necessary
- IGNORE-VERSION: 0.92-1
+libparse-mediawikidump-perl (0.93-1) UNRELEASED; urgency=low
[ Jonathan Yu ]
* New upstream release
- + Completed documentation
+ + Pages is now a subclass of Revisions
+ + Fix a memory leak in Pages and Revisions
+ + Now uses Test::Exception
* Changed to new debhelper format
* Updated copyright information
* Added myself to Uploaders and Copyright
@@ -17,7 +16,7 @@
[ Ryan Niebur ]
* Update jawnsy's email address
- -- Ryan Niebur <ryanryan52 at gmail.com> Tue, 01 Sep 2009 21:19:28 -0700
+ -- Jonathan Yu <jawnsy at cpan.org> Sun, 20 Sep 2009 11:31:54 -0400
libparse-mediawikidump-perl (0.91-1) unstable; urgency=low
Modified: trunk/libparse-mediawikidump-perl/debian/control
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/debian/control?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/debian/control (original)
+++ trunk/libparse-mediawikidump-perl/debian/control Sun Sep 20 19:31:22 2009
@@ -2,7 +2,8 @@
Section: perl
Priority: optional
Build-Depends: debhelper (>= 7)
-Build-Depends-Indep: perl, libxml-parser-perl, libobject-destroyer-perl
+Build-Depends-Indep: perl, libxml-parser-perl, libobject-destroyer-perl,
+ libtest-exception-perl
Maintainer: Debian Perl Group <pkg-perl-maintainers at lists.alioth.debian.org>
Uploaders: Xavier Oswald <xoswald at debian.org>,
gregor herrmann <gregoa at debian.org>, Ansgar Burchardt <ansgar at 43-1.org>,
Modified: trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump.pm
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump.pm?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump.pm (original)
+++ trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump.pm Sun Sep 20 19:31:22 2009
@@ -1,5 +1,5 @@
package Parse::MediaWikiDump;
-our $VERSION = '0.92';
+our $VERSION = '0.93';
use Parse::MediaWikiDump::XML;
use Parse::MediaWikiDump::Revisions;
Modified: trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/CategoryLinks.pm
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/CategoryLinks.pm?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/CategoryLinks.pm (original)
+++ trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/CategoryLinks.pm Sun Sep 20 19:31:22 2009
@@ -1,6 +1,6 @@
package Parse::MediaWikiDump::CategoryLinks;
-our $VERSION = '0.92';
+our $VERSION = '0.93';
use strict;
use warnings;
Modified: trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Links.pm
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Links.pm?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Links.pm (original)
+++ trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Links.pm Sun Sep 20 19:31:22 2009
@@ -1,6 +1,6 @@
package Parse::MediaWikiDump::Links;
-our $VERSION = '0.92';
+our $VERSION = '0.93';
use strict;
use warnings;
Modified: trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Pages.pm
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Pages.pm?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Pages.pm (original)
+++ trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Pages.pm Sun Sep 20 19:31:22 2009
@@ -1,746 +1,87 @@
package Parse::MediaWikiDump::Pages;
-our $VERSION = '0.92';
-
-#This parser works by placing all of the start, text, and end events into
-#a buffer as they come out of XML::Parser. On each call to page() the function
-#checks for a complete article in the buffer and calls for XML::Parser to add
-#more tokens if a complete article is not found. Once a complete article is
-#found it is removed from the buffer, parsed, and an instance of the page
-#object is returned.
-
-use 5.8.0;
+our $VERSION = '0.93';
+
+use base qw(Parse::MediaWikiDump::Revisions);
use strict;
use warnings;
-use List::Util;
-use XML::Parser;
use Carp;
-sub new {
- my ($class, $source) = @_;
- my $self = {};
- my $parser_state = {}; #Hash::NoRef->new;
-
- bless ($self, $class);
-
- $$self{PARSER} = XML::Parser->new(ProtocolEncoding => 'UTF-8');
- $$self{PARSER}->setHandlers('Start', \&start_handler,
- 'End', \&end_handler);
-
- $$self{GOOD_TAGS} = make_good_tags();
- $$self{BUFFER} = [];
- $$self{CHUNK_SIZE} = 32768;
- $$self{BUF_LIMIT} = 10000;
- $$self{BYTE} = 0;
-
- $parser_state->{GOOD_TAGS} = $$self{GOOD_TAGS};
- $parser_state->{BUFFER} = $$self{BUFFER};
-
- my $expat_bb = $$self{PARSER}->parse_start(state => $parser_state);
- $$self{EXPAT} = Object::Destroyer->new($expat_bb, 'parse_done');
-
- $self->open($source);
- $self->init;
-
- return $self;
+sub new_accumulator_engine {
+ my ($self) = @_;
+ my $f = Parse::MediaWikiDump::XML::Accumulator->new;
+ my $store_siteinfo = $self->{SITEINFO};
+ my $store_page = $self->{PAGE_LIST};
+
+ my $root = $f->root;
+ my $mediawiki = $f->node('mediawiki', Start => \&validate_mediawiki_node);
+
+ #stuff for siteinfo
+ my $siteinfo = $f->node('siteinfo', End => sub { %$store_siteinfo = %{ $_[1] } } );
+ my $sitename = $f->textcapture('sitename');
+ my $base = $f->textcapture('base');
+ my $generator = $f->textcapture('generator');
+ my $case = $f->textcapture('case');
+ my $namespaces = $f->node('namespaces', Start => sub { $_[1]->{namespaces} = []; } );
+ my $namespace = $f->node('namespace', Character => \&save_namespace_node);
+
+ #stuff for page entries
+ my $page = $f->node('page', Start => sub { $_[0]->accumulator( {} ) } );
+ my $title = $f->textcapture('title');
+ my $id = $f->textcapture('id');
+ my $revision = $f->node('revision',
+ Start => sub { $_[1]->{minor} = 0 },
+ End => sub {
+ if (defined($_[1]->{seen_revision})) {
+ die "only one revision per page is allowed\n";
+ }
+
+ $_[1]->{seen_revision} = 1;
+
+ push(@$store_page, { %{ $_[1] } } );
+ } );
+ my $rev_id = $f->textcapture('id', 'revision_id');
+ my $minor = $f->node('minor', Start => sub { $_[1]->{minor} = 1 } );
+ my $time = $f->textcapture('timestamp');
+ my $contributor = $f->node('contributor');
+ my $username = $f->textcapture('username');
+ my $ip = $f->textcapture('ip');
+ my $contrib_id = $f->textcapture('id', 'userid');
+ my $comment = $f->textcapture('comment');
+ my $text = $f->textcapture('text');
+ my $restr = $f->textcapture('restrictions');
+
+ #put together the tree
+ $siteinfo->add_child($sitename, $base, $generator, $case, $namespaces);
+ $namespaces->add_child($namespace);
+
+ $page->add_child($title, $id, $revision, $restr);
+ $revision->add_child($rev_id, $time, $contributor, $minor, $comment, $text);
+ $contributor->add_child($username, $ip, $contrib_id);
+
+ $mediawiki->add_child($siteinfo, $page);
+ $root->add_child($mediawiki);
+
+ my $engine = $f->engine($root, {});
+
+ return $engine;
}
-sub next {
- my ($self) = @_;
- my $buffer = $$self{BUFFER};
- my $offset;
- my @page;
-
- #look through the contents of our buffer for a complete article; fill
- #the buffer with more data if an entire article is not there
- while(1) {
- $offset = $self->search_buffer('/page');
- last if $offset != -1;
-
- #indicates EOF
- return undef unless $self->parse_more;
- }
-
- #remove the entire page from the buffer
- @page = splice(@$buffer, 0, $offset + 1);
-
- if ($page[0][0] ne 'page') {
- $self->dump($buffer);
- die "expected <page>; got " . token2text($page[0]);
- }
-
- my $data = $self->parse_page(\@page);
-
- return Parse::MediaWikiDump::page->new($data, $$self{CATEGORY_ANCHOR},
- $$self{HEAD}{CASE}, $$self{HEAD}{namespaces});
+sub validate_mediawiki_node {
+ my ($engine, $a, $element, $attrs) = @_;
+ die "Only version 0.3 dump files are supported" unless $attrs->{version} eq '0.3';
}
-#outputs a nicely formated representation of the tokens on the buffer specified
-sub dump {
- my ($self, $buffer) = @_;
- my $offset = 0;
-
- if (! defined($buffer)) {
- $buffer = $$self{BUFFER};
- }
-
- foreach my $i (0 .. $#$buffer) {
- my $token = $$buffer[$i];
-
- print STDERR "$i ";
-
- if (substr($$token[0], 0, 1) ne '/') {
- my $attr = $$token[1];
- print STDERR " " x $offset;
- print STDERR "START $$token[0] ";
-
- foreach my $key (sort(keys(%$attr))) {
- print STDERR "$key=\"$$attr{$key}\" ";
- }
-
- print STDERR "\n";
- $offset++;
- } elsif (ref $token eq 'ARRAY') {
- $offset--;
- print STDERR " " x $offset;
- print STDERR "END $$token[0]\n";
- } elsif (ref $token eq 'SCALAR') {
- my $ref = $token;
- print STDERR " " x $offset;
- print STDERR "TEXT ";
-
- my $len = length($$ref);
-
- if ($len < 50) {
- print STDERR "'$$ref'\n";
- } else {
- print STDERR "$len characters\n";
- }
- }
- }
-
- return 1;
+sub save_namespace_node {
+ my ($parser, $accum, $text, $element, $attrs) = @_;
+ my $key = $attrs->{key};
+ my $namespaces = $accum->{namespaces};
+
+ push(@{ $accum->{namespaces} }, [$key, $text] );
}
-sub sitename {
- my ($self) = @_;
- return $$self{HEAD}{sitename};
-}
-
-sub base {
- my ($self) = @_;
- return $$self{HEAD}{base};
-}
-
-sub generator {
- my ($self) = @_;
- return $$self{HEAD}{generator};
-}
-
-sub case {
- my ($self) = @_;
- return $$self{HEAD}{case};
-}
-
-sub namespaces {
- my ($self) = @_;
- return $$self{HEAD}{namespaces};
-}
-
-sub namespaces_names {
- my $self = shift;
- return $$self{HEAD}{namespaces_names};
-}
-
-sub current_byte {
- my ($self) = @_;
- return $$self{BYTE};
-}
-
-sub size {
- my ($self) = @_;
-
- return undef unless defined $$self{SOURCE_FILE};
-
- my @stat = stat($$self{SOURCE_FILE});
-
- return $stat[7];
-}
-
-#depreciated backwards compatibility methods
-
-#replaced by next()
-sub page {
- my ($self) = @_;
-
- carp("the page() method is depreciated and is going away in the future, use next() instead");
-
- return $self->next(@_);
-}
-
-#private functions with OO interface
-sub open {
- my ($self, $source) = @_;
-
- if (ref($source) eq 'GLOB') {
- $$self{SOURCE} = $source;
- } else {
- if (! open($$self{SOURCE}, $source)) {
- die "could not open $source: $!";
- }
-
- $$self{SOURCE_FILE} = $source;
- }
-
- binmode($$self{SOURCE}, ':utf8');
-
- return 1;
-}
-
-sub init {
- my ($self) = @_;
- my $offset;
- my @head;
-
- #parse more XML until the entire siteinfo section is in the buffer
- while(1) {
- die "could not init" unless $self->parse_more;
-
- $offset = $self->search_buffer('/siteinfo');
-
- last if $offset != -1;
- }
-
- #pull the siteinfo section out of the buffer
- @head = splice(@{$$self{BUFFER}}, 0, $offset + 1);
-
- $self->parse_head(\@head);
-
- return 1;
-}
-
-#feed data into expat and have it put more tokens onto the buffer
-sub parse_more {
- my ($self) = @_;
- my $buf;
-
- my $read = read($$self{SOURCE}, $buf, $$self{CHUNK_SIZE});
-
- if (! defined($read)) {
- die "error during read: $!";
- } elsif ($read == 0) {
- $$self{FINISHED} = 1;
- $$self{EXPAT} = undef; #Object::Destroyer invokes parse_done()
- return 0;
- }
-
- $$self{BYTE} += $read;
- $$self{EXPAT}->parse_more($buf);
-
- my $buflen = scalar(@{$$self{BUFFER}});
-
- die "buffer length of $buflen exceeds $$self{BUF_LIMIT}" unless
- $buflen < $$self{BUF_LIMIT};
-
- return 1;
-}
-
-#searches through a buffer for a specified token
-sub search_buffer {
- my ($self, $search, $list) = @_;
-
- $list = $$self{BUFFER} unless defined $list;
-
- return -1 if scalar(@$list) == 0;
-
- foreach my $i (0 .. $#$list) {
- return $i if ref $$list[$i] eq 'ARRAY' && $list->[$i][0] eq $search;
- }
-
- return -1;
-}
-
-#this function is very frightning :-(
-#a better alternative would be to have each part of the stack handled by a
-#function that handles all the logic for that specific node in the tree
-sub parse_head {
- my ($self, $buffer) = @_;
- my $state = 'start';
- my %data = (
- namespaces => [],
- namespaces_names => [],
- );
-
- for (my $i = 0; $i <= $#$buffer; $i++) {
- my $token = $$buffer[$i];
-
- if ($state eq 'start') {
- my $version;
- die "$i: expected <mediawiki> got " . token2text($token) unless
- $$token[0] eq 'mediawiki';
-
- die "$i: version is a required attribute" unless
- defined($version = $$token[1]->{version});
-
- die "$i: version $version unsupported" unless $version eq '0.3';
-
- $token = $$buffer[++$i];
-
- die "$i: expected <siteinfo> got " . token2text($token) unless
- $$token[0] eq 'siteinfo';
-
- $state = 'in_siteinfo';
- } elsif ($state eq 'in_siteinfo') {
- if ($$token[0] eq 'namespaces') {
- $state = 'in_namespaces';
- next;
- } elsif ($$token[0] eq '/siteinfo') {
- last;
- } elsif ($$token[0] eq 'sitename') {
- $token = $$buffer[++$i];
-
- if (ref $token ne 'SCALAR') {
- die "$i: expected TEXT but got " . token2text($token);
- }
-
- $data{sitename} = $$token;
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/sitename') {
- die "$i: expected </sitename> but got " . token2text($token);
- }
- } elsif ($$token[0] eq 'base') {
- $token = $$buffer[++$i];
-
- if (ref $token ne 'SCALAR') {
- $self->dump($buffer);
- die "$i: expected TEXT but got " . token2text($token);
- }
-
- $data{base} = $$token;
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/base') {
- $self->dump($buffer);
- die "$i: expected </base> but got " . token2text($token);
- }
-
- } elsif ($$token[0] eq 'generator') {
- $token = $$buffer[++$i];
-
- if (ref $token ne 'SCALAR') {
- $self->dump($buffer);
- die "$i: expected TEXT but got " . token2text($token);
- }
-
- $data{generator} = $$token;
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/generator') {
- $self->dump($buffer);
- die "$i: expected </generator> but got " . token2text($token);
- }
-
- } elsif ($$token[0] eq 'case') {
- $token = $$buffer[++$i];
-
- if (ref $token ne 'SCALAR') {
- $self->dump($buffer);
- die "$i: expected </case> but got " . token2text($token);
- }
-
- $data{case} = $$token;
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/case') {
- $self->dump($buffer);
- die "$i: expected </case> but got " . token2text($token);
- }
- }
-
- } elsif ($state eq 'in_namespaces') {
- my $key;
- my $name;
-
- if ($$token[0] eq '/namespaces') {
- $state = 'in_siteinfo';
- next;
- }
-
- if ($$token[0] ne 'namespace') {
- die "$i: expected <namespace> or </namespaces>; got " . token2text($token);
- }
-
- die "$i: key is a required attribute" unless
- defined($key = $$token[1]->{key});
-
- $token = $$buffer[++$i];
-
- #the default namespace has no text associated with it
- if (ref $token eq 'SCALAR') {
- $name = $$token;
- } elsif ($$token[0] eq '/namespace') {
- $name = '';
- $i--; #move back one for below
- } else {
- die "$i: should never happen";
- }
-
- push(@{$data{namespaces}}, [$key, $name]);
- push(@{$data{namespaces_names}}, $name);
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/namespace') {
- $self->dump($buffer);
- die "$i: expected </namespace> but got " . token2text($token);
- }
-
- } else {
- die "$i: unknown state '$state'";
- }
- }
-
- $$self{HEAD} = \%data;
-
- #locate the anchor that indicates what looks like a link is really a
- #category assignment ([[foo]] vs [[Category:foo]])
- #fix for bug #16616
- foreach my $ns (@{$data{namespaces}}) {
- #namespace 14 is the category namespace
- if ($$ns[0] == 14) {
- $$self{CATEGORY_ANCHOR} = $$ns[1];
- last;
- }
- }
-
- if (! defined($$self{CATEGORY_ANCHOR})) {
- die "Could not locate category indicator in namespace definitions";
- }
-
- return 1;
-}
-
-#this function is very frightning :-(
-#see the parse_head function comments for thoughts on improving these
-#awful functions
-sub parse_page {
- my ($self, $buffer) = @_;
- my %data;
- my $state = 'start';
-
- for (my $i = 0; $i <= $#$buffer; $i++) {
- my $token = $$buffer[$i];
-
-
- if ($state eq 'start') {
- if ($$token[0] ne 'page') {
- $self->dump($buffer);
- die "$i: expected <page>; got " . token2text($token);
- }
-
- $state = 'in_page';
- } elsif ($state eq 'in_page') {
- next unless ref $token eq 'ARRAY';
- if ($$token[0] eq 'revision') {
- $state = 'in_revision';
- next;
- } elsif ($$token[0] eq '/page') {
- last;
- } elsif ($$token[0] eq 'title') {
- $token = $$buffer[++$i];
-
- if (ref $token eq 'ARRAY' && $$token[0] eq '/title') {
- $data{title} = '';
- next;
- }
-
- if (ref $token ne 'SCALAR') {
- $self->dump($buffer);
- die "$i: expected TEXT; got " . token2text($token);
- }
-
- $data{title} = $$token;
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/title') {
- $self->dump($buffer);
- die "$i: expected </title>; got " . token2text($token);
- }
- } elsif ($$token[0] eq 'id') {
- $token = $$buffer[++$i];
-
- if (ref $token ne 'SCALAR') {
- $self->dump($buffer);
- die "$i: expected TEXT; got " . token2text($token);
- }
-
- $data{id} = $$token;
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/id') {
- $self->dump($buffer);
- die "$i: expected </id>; got " . token2text($token);
- }
- }
- } elsif ($state eq 'in_revision') {
- if ($$token[0] eq '/revision') {
- #If a comprehensive dump file is parsed
- #it can cause uncontrolled stack growth and the
- #parser only returns one revision out of
- #all revisions - if we run into a
- #comprehensive dump file, indicated by more
- #than one <revision> section inside a <page>
- #section then die with a message
-
- #just peeking ahead, don't want to update
- #the index
- $token = $$buffer[$i + 1];
-
- if ($$token[0] eq 'revision') {
- die "unable to properly parse comprehensive dump files";
- }
-
- $state = 'in_page';
- next;
- } elsif ($$token[0] eq 'contributor') {
- $state = 'in_contributor';
- next;
- } elsif ($$token[0] eq 'id') {
- $token = $$buffer[++$i];
-
- if (ref $token ne 'SCALAR') {
- $self->dump($buffer);
- die "$i: expected TEXT; got " . token2text($token);
- }
-
- $data{revision_id} = $$token;
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/id') {
- $self->dump($buffer);
- die "$i: expected </id>; got " . token2text($token);
- }
-
- } elsif ($$token[0] eq 'timestamp') {
- $token = $$buffer[++$i];
-
- if (ref $token ne 'SCALAR') {
- $self->dump($buffer);
- die "$i: expected TEXT; got " . token2text($token);
- }
-
- $data{timestamp} = $$token;
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/timestamp') {
- $self->dump($buffer);
- die "$i: expected </timestamp>; got " . token2text($token);
- }
- } elsif ($$token[0] eq 'minor') {
- $data{minor} = 1;
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/minor') {
- $self->dump($buffer);
- die "$i: expected </minor>; got " . token2text($token);
- }
- } elsif ($$token[0] eq 'comment') {
- $token = $$buffer[++$i];
-
- #account for possible null-text
- if (ref $token eq 'ARRAY' && $$token[0] eq '/comment') {
- $data{comment} = '';
- next;
- }
-
- if (ref $token ne 'SCALAR') {
- $self->dump($buffer);
- die "$i: expected TEXT; got " . token2text($token);
- }
-
- $data{comment} = $$token;
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/comment') {
- $self->dump($buffer);
- die "$i: expected </comment>; got " . token2text($token);
- }
-
- } elsif ($$token[0] eq 'text') {
- my $token = $$buffer[++$i];
-
- if (ref $token eq 'ARRAY' && $$token[0] eq '/text') {
- $data{text} = '';
- next;
- } elsif (ref $token ne 'SCALAR') {
- $self->dump($buffer);
- die "$i: expected TEXT; got " . token2text($token);
- }
-
- $data{text} = $$token;
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/text') {
- $self->dump($buffer);
- die "$i: expected </text>; got " . token2text($token);
- }
-
- }
-
- } elsif ($state eq 'in_contributor') {
- next unless ref $token eq 'ARRAY';
- if ($$token[0] eq '/contributor') {
- $state = 'in_revision';
- next;
- } elsif (ref $token eq 'ARRAY' && $$token[0] eq 'username') {
- $token = $$buffer[++$i];
-
- if (ref $token ne 'SCALAR') {
- $self->dump($buffer);
- die "$i: expecting TEXT; got " . token2text($token);
- }
-
- $data{username} = $$token;
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/username') {
- $self->dump($buffer);
- die "$i: expected </username>; got " . token2text($token);
- }
-
- } elsif ($$token[0] eq 'id') {
- $token = $$buffer[++$i];
-
- if (ref $token ne 'SCALAR') {
- $self->dump($buffer);
- die "$i: expecting TEXT; got " . token2text($token);
- }
-
- $data{userid} = $$token;
-
- $token = $$buffer[++$i];
-
- if ($$token[0] ne '/id') {
- $self->dump($buffer);
- die "$i: expecting </id>; got " . token2text($token);
- }
- }
- } else {
- die "unknown state: $state";
- }
- }
-
- $data{namespace} = '';
- # Many pages just have a : in the title, but it's not necessary
- # a namespace designation.
- if ($data{title} =~ m/^([^:]+)\:/) {
- my $possible_namespace = $1;
- if (List::Util::first { $_ eq $possible_namespace }
- @{ $self->namespaces_names() })
- {
- $data{namespace} = $possible_namespace;
- }
- }
-
- $data{minor} = 0 unless defined($data{minor});
-
- return \%data;
-}
-
-#private functions with out OO interface
-sub make_good_tags {
- return {
- sitename => 1,
- base => 1,
- generator => 1,
- case => 1,
- namespace => 1,
- title => 1,
- id => 1,
- timestamp => 1,
- username => 1,
- comment => 1,
- text => 1
- };
-}
-
-sub token2text {
- my ($token) = @_;
-
- if (ref $token eq 'ARRAY') {
- return "<$$token[0]>";
- } elsif (ref $token eq 'SCALAR') {
- return "!text_token!";
- } else {
- return "!unknown!";
- }
-}
-
-#this function is where the majority of time is spent in this software
-#sub token_compare {
-# my ($toke1, $toke2) = @_;
-#
-# foreach my $i (0 .. $#$toke2) {
-# if ($$toke1[$i] ne $$toke2[$i]) {
-# return 0;
-# }
-# }
-#
-# return 1;
-#}
-
-sub start_handler {
- my ($p, $tag, %atts) = @_;
- my $self = $p->{state};
- my $good_tags = $$self{GOOD_TAGS};
-
- push @{ $$self{BUFFER} }, [$tag, \%atts];
-
- if (defined($good_tags->{$tag})) {
- $p->setHandlers(Char => \&char_handler);
- }
-
- return 1;
-}
-
-sub end_handler {
- my ($p, $tag) = @_;
- my $self = $p->{state};
-
- push @{ $$self{BUFFER} }, ["/$tag"];
-
- $p->setHandlers(Char => undef);
-
- return 1;
-}
-
-sub char_handler {
- my ($p, $chars) = @_;
- my $self = $p->{state};
- my $buffer = $$self{BUFFER};
- my $curent = $$buffer[-1];
-
- if (ref $curent eq 'SCALAR') {
- $$curent .= $chars;
- } elsif (substr($$curent[0], 0, 1) ne '/') {
- push(@$buffer, \$chars);
- }
-
- return 1;
-}
+
1;
@@ -946,3 +287,9 @@
}
}
+=head1 LIMITATIONS
+
+=head2 Memory Leak
+
+This class is not performing proper garbage collection at destruction and will leak memory like crazy if
+multiple instances of it are created inside one perl script.
Modified: trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Revisions.pm
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Revisions.pm?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Revisions.pm (original)
+++ trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/Revisions.pm Sun Sep 20 19:31:22 2009
@@ -1,6 +1,6 @@
package Parse::MediaWikiDump::Revisions;
-our $VERSION = '0.92';
+our $VERSION = '0.93';
use 5.8.0;
@@ -28,6 +28,7 @@
$self->open($source);
$self->init;
+ #return Object::Destroyer($self, 'cleanup');
return $self;
}
@@ -105,14 +106,16 @@
#private functions with OO interface
-#sub cleanup {
-# my ($self) = @_;
-#
-# warn "executing cleanup";
-#
-## $self->{EXPAT} = undef;
-## $self->{XML} = undef;
-#}
+sub cleanup {
+ my ($self) = @_;
+
+ warn "executing cleanup";
+
+ $self->{EXPAT}->setHandlers(Init => undef, Final => undef, Start => undef,
+ End => undef, Char => undef);
+ $self->{EXPAT}->parse_done;
+ #$self->{XML} = undef;
+}
sub open {
my ($self, $source) = @_;
@@ -137,12 +140,13 @@
$self->{XML} = $self->new_accumulator_engine;
my $expat_bb = $$self{XML}->parser->parse_start();
- $$self{EXPAT} = Object::Destroyer->new($expat_bb, 'parse_done');
+ #$$self{EXPAT} = Object::Destroyer->new($expat_bb, 'parse_done'); #causes exceptions not to be thrown
+ $$self{EXPAT} = $expat_bb;
#load the information from the siteinfo section so it is available before
#someone calls ->next
while(1) {
- if (scalar(@{$self->{PAGE_LIST}}) > 1) {
+ if (scalar(@{$self->{PAGE_LIST}}) > 0) {
last;
}
@@ -394,3 +398,10 @@
return $title;
}
+
+=head1 LIMITATIONS
+
+=head2 Memory Leak
+
+This class is not performing proper garbage collection at destruction and will leak memory like crazy if
+multiple instances of it are created inside one perl script.
Modified: trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/XML.pm
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/XML.pm?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/XML.pm (original)
+++ trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/XML.pm Sun Sep 20 19:31:22 2009
@@ -2,7 +2,7 @@
#testing is done and documentation is written
package Parse::MediaWikiDump::XML::Accumulator;
-our $VERSION = '0.92';
+our $VERSION = '0.93';
use warnings;
use strict;
@@ -64,6 +64,7 @@
$self->{node_stack} = [ $root ];
return Object::Destroyer->new($self, 'cleanup');
+ #return $self;
}
sub cleanup {
@@ -76,10 +77,12 @@
sub init_parser {
my ($self) = @_;
+ #warn "init_parser called";
+
my $parser = XML::Parser->new(
Handlers => {
- Init => sub { handle_init_event($self, @_) },
- Final => sub { handle_final_event($self, @_) },
+ #Init => sub { handle_init_event($self, @_) },
+ #Final => sub { handle_final_event($self, @_) },
Start => sub { handle_start_event($self, @_) },
End => sub { handle_end_event($self, @_) },
Char => sub { handle_char_event($self, @_) },
@@ -120,7 +123,7 @@
my $element_stack = $self->{element_stack};
my $node = $self->node;
my $matched = $node->{children}->{$element};
- my $handler;
+ my $handler;
if (! defined($matched)) {
die "fatal error - no match for element $element";
Modified: trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/category_link.pm
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/category_link.pm?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/category_link.pm (original)
+++ trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/category_link.pm Sun Sep 20 19:31:22 2009
@@ -1,6 +1,6 @@
package Parse::MediaWikiDump::category_link;
-our $VERSION = '0.92';
+our $VERSION = '0.93';
#you must pass in a fully populated link array reference
sub new {
Modified: trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/link.pm
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/link.pm?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/link.pm (original)
+++ trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/link.pm Sun Sep 20 19:31:22 2009
@@ -1,6 +1,6 @@
package Parse::MediaWikiDump::link;
-our $VERSION = '0.92';
+our $VERSION = '0.93';
#you must pass in a fully populated link array reference
sub new {
Modified: trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/page.pm
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/page.pm?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/page.pm (original)
+++ trunk/libparse-mediawikidump-perl/lib/Parse/MediaWikiDump/page.pm Sun Sep 20 19:31:22 2009
@@ -1,6 +1,6 @@
package Parse::MediaWikiDump::page;
-our $VERSION = '0.92';
+our $VERSION = '0.93';
use strict;
use warnings;
Modified: trunk/libparse-mediawikidump-perl/t/pages.t
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/t/pages.t?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/t/pages.t (original)
+++ trunk/libparse-mediawikidump-perl/t/pages.t Sun Sep 20 19:31:22 2009
@@ -1,6 +1,6 @@
#!perl -w
-use Test::Simple tests => 82;
+use Test::Simple tests => 94;
use strict;
use Parse::MediaWikiDump;
@@ -43,7 +43,7 @@
ok($pages->current_byte != 0);
if ($mode eq 'file') {
- ok($pages->size == 2872);
+ ok($pages->size == 2874);
} elsif ($mode eq 'handle') {
ok(! defined($pages->size))
} else {
@@ -53,24 +53,32 @@
ok($page->title eq 'Talk:Title Test Value');
ok($page->id == 1);
- ok($page->timestamp eq '2005-07-09T18:41:10Z');
+ ok($page->revision_id == 47084);
ok($page->username eq 'Username Test Value');
ok($page->userid == 1292);
+ ok($page->timestamp eq '2005-07-09T18:41:10Z');
+ ok($page->userid == 1292);
+ ok($page->minor);
ok($$text eq "Text Test Value\n");
ok($page->namespace eq 'Talk');
+ ok(! defined($page->redirect));
ok(! defined($page->categories));
}
sub test_two {
my $page = $pages->next;
+ my $text = $page->text;
- ok(defined($page));
- ok($page->redirect eq 'fooooo');
ok($page->title eq 'Title Test Value #2');
ok($page->id == 2);
+ ok($page->revision_id eq '47085');
+ ok($page->username eq 'Username Test Value 2');
ok($page->timestamp eq '2005-07-09T18:41:10Z');
- ok($page->username eq 'Username Test Value');
ok($page->userid == 1292);
+ ok($page->minor);
+ ok($$text eq "#redirect : [[fooooo]]\n");
+ ok($page->namespace eq '');
+ ok($page->redirect eq 'fooooo');
ok(! defined($page->categories));
}
@@ -78,13 +86,12 @@
my $page = $pages->next;
ok(defined($page));
- ok($page->redirect eq 'fooooo');
+ ok($page->redirect);
ok($page->title eq 'Title Test Value #3');
ok($page->id == 3);
ok($page->timestamp eq '2005-07-09T18:41:10Z');
ok($page->username eq 'Username Test Value');
ok($page->userid == 1292);
- ok(! defined($page->categories));
}
sub test_four {
Modified: trunk/libparse-mediawikidump-perl/t/pages_test.xml
URL: http://svn.debian.org/wsvn/pkg-perl/trunk/libparse-mediawikidump-perl/t/pages_test.xml?rev=44447&op=diff
==============================================================================
--- trunk/libparse-mediawikidump-perl/t/pages_test.xml (original)
+++ trunk/libparse-mediawikidump-perl/t/pages_test.xml Sun Sep 20 19:31:22 2009
@@ -45,7 +45,7 @@
<revision>
<id>47085</id>
<timestamp>2005-07-09T18:41:10Z</timestamp>
- <contributor><username>Username Test Value</username><id>1292</id></contributor>
+ <contributor><username>Username Test Value 2</username><id>1292</id></contributor>
<minor/>
<comment>Comment Test Value</comment>
<text xml:space="preserve">#redirect : [[fooooo]]
More information about the Pkg-perl-cvs-commits
mailing list