r10185 - in /branches/upstream/libhtml-simpleparse-perl: ./ current/ current/lib/ current/lib/HTML/ current/t/

vdanjean at users.alioth.debian.org vdanjean at users.alioth.debian.org
Sat Dec 1 11:58:22 UTC 2007


Author: vdanjean
Date: Sat Dec  1 11:58:22 2007
New Revision: 10185

URL: http://svn.debian.org/wsvn/?sc=1&rev=10185
Log:
[svn-inject] Installing original source of libhtml-simpleparse-perl

Added:
    branches/upstream/libhtml-simpleparse-perl/
    branches/upstream/libhtml-simpleparse-perl/current/
    branches/upstream/libhtml-simpleparse-perl/current/Build.PL
    branches/upstream/libhtml-simpleparse-perl/current/Changes
    branches/upstream/libhtml-simpleparse-perl/current/MANIFEST
    branches/upstream/libhtml-simpleparse-perl/current/META.yml
    branches/upstream/libhtml-simpleparse-perl/current/Makefile.PL
    branches/upstream/libhtml-simpleparse-perl/current/README
    branches/upstream/libhtml-simpleparse-perl/current/lib/
    branches/upstream/libhtml-simpleparse-perl/current/lib/HTML/
    branches/upstream/libhtml-simpleparse-perl/current/lib/HTML/SimpleParse.pm
    branches/upstream/libhtml-simpleparse-perl/current/t/
    branches/upstream/libhtml-simpleparse-perl/current/t/basic.t

Added: branches/upstream/libhtml-simpleparse-perl/current/Build.PL
URL: http://svn.debian.org/wsvn/branches/upstream/libhtml-simpleparse-perl/current/Build.PL?rev=10185&op=file
==============================================================================
--- branches/upstream/libhtml-simpleparse-perl/current/Build.PL (added)
+++ branches/upstream/libhtml-simpleparse-perl/current/Build.PL Sat Dec  1 11:58:22 2007
@@ -1,0 +1,9 @@
+use Module::Build;
+
+my $b = new Module::Build
+  (
+   module_name => 'HTML::SimpleParse',
+   license => 'perl',
+  );
+
+$b->create_build_script;

Added: branches/upstream/libhtml-simpleparse-perl/current/Changes
URL: http://svn.debian.org/wsvn/branches/upstream/libhtml-simpleparse-perl/current/Changes?rev=10185&op=file
==============================================================================
--- branches/upstream/libhtml-simpleparse-perl/current/Changes (added)
+++ branches/upstream/libhtml-simpleparse-perl/current/Changes Sat Dec  1 11:58:22 2007
@@ -1,0 +1,99 @@
+Revision history for Perl extension HTML::SimpleParse.
+
+0.12  Wed Jul  9 12:19:38 CDT 2003
+
+ - Clarify the relationship between this module and HTML::TreeBuilder
+   in the documentation. [suggested by Gisle Aas]
+
+ - Moved regression tests from test.pl to t/basic.t
+
+0.11  Sun Jan 26 10:00:41 CST 2003
+
+ - Use Test.pm to output testing results.
+
+ - Avoid an 'undefined value' warning when creating a SimpleParse
+   object with an empty string.
+
+ - Fixed a problem that caused an infinite loop in certain bizarre
+   (and as yet unduplicated by me) situations.  Reported by Peter Suschlik.
+
+ - Added a Build.PL script to build & install via Module::Build.
+
+0.10  Sat Jul  1 14:47:07 EDT 2000
+   - Added the 'offset' attribute to each tree element, which tells you how
+     many bytes the current tag is from the beginning of the HTML text.
+     Idea from azamani at design-matters.com (Adrien Zamani).
+
+   - &parse_args will now recognize bare end tags correctly.  Thanks to
+     Paul.Makepeace at realprogrammers.com (Paul Makepeace).
+
+0.09  Sat May 27 12:18:38 EDT 2000
+   - HTML tag attribute names should contain only letters, digits, periods,
+     or hyphens.  I've made this change to parse_args().
+
+   - parse_args() handles single-quoted data now. [tim.holt at qsent.com (Tim Holt)]
+
+   - parse_args() now respects a new flag 'fix_case', which can be set
+     either as a package global or a class data member, which controls
+     whether attribute names are upper-cased or lower-cased or left alone.
+     Previously everything was left alone, which was incorrect because HTML
+     tag attribute names are supposed to be case-insensitive.  Now the
+     default is to upper-case everything.  [spot: tim.holt at qsent.com (Tim Holt)]
+
+0.08  Sat Jan 29 03:16:48 EST 2000
+    - Avoid warning when new() is called without feeding HTML to it
+      [jerome.oneil at activeindexing.com (Jerome O'Neil)]
+
+0.07  Wed Dec  1 17:09:23 EST 1999
+
+    - The parse_args routine will now allow whitespace between an attribute
+      and its value (on both sides of the equal sign).  I believe this
+      is compliant with the SGML spec at
+      http://www.w3.org/MarkUp/SGML/sgml-lex/sgml.l .  Thanks to
+      jo at homepage-design.net (Joachim Seibert) for the spot.
+      
+0.06  Fri Feb  5 18:22:15 EST 1999
+    - Fixed a bug in the parse_args routine that should properly handle
+      escaping of parameters like var="a \"value\"" [thanks, Philippe
+      Chiasson]
+
+0.05  Thu Aug 20 12:11:53 EDT 1998
+    - Fixed a bug in HTML::SimpleParse->parse_args('a="b=c"').  Thanks
+      to Shenghuo ZHU for finding it.
+
+0.04  Tue Aug 18 19:46:01 EDT 1998
+
+    - added execute() method that lets you do:
+      foreach ($p->tree) {
+         print $p->execute($_);
+      }
+      This lets Apache::OutputChain and Apache::SSIChain output their results in
+      the correct order.
+
+0.03  Fri Aug  7 13:03:34 EDT 1998
+    - changed the output_* methods so that they return their
+      output, instead of printing it.  There's a new method, get_output(),
+      which returns a string containing the parsed output.  The
+      output() method then prints the result of get_output().
+      This lets the module work with Apache::OutputChain.  Thanks to 
+      Honza Pazdziora for the patches (which I modified a little).
+    
+    - fixed the processing of markup declarations (e.g. <!DOCTYPE ...>)
+    
+    - parse_args method can handle leading whitespace now:
+      my %hash=HTML::SimpleParse->parse_args(' A="xx" B');
+      used to return an empty hash (thanks to Ben Laurie again).
+      
+    - parse_args makes sure that it starts matching at the beginning
+      of the string you give it, by setting pos() to 0.
+    
+    I'm also going to update Apache::SSI so that it can use this new version.
+
+0.02  Thu Aug  6 16:08:22 EDT 1998
+    - fixed bug in parse_args method pointed out by Ben Laurie:
+       %hash=HTML::SimpleParse->parse_args('A="xx" B');
+       did not include B in %hash.
+
+0.01  Fri Jun 26 17:42:41 1998
+    - original version; created by h2xs 1.18
+

Added: branches/upstream/libhtml-simpleparse-perl/current/MANIFEST
URL: http://svn.debian.org/wsvn/branches/upstream/libhtml-simpleparse-perl/current/MANIFEST?rev=10185&op=file
==============================================================================
--- branches/upstream/libhtml-simpleparse-perl/current/MANIFEST (added)
+++ branches/upstream/libhtml-simpleparse-perl/current/MANIFEST Sat Dec  1 11:58:22 2007
@@ -1,0 +1,8 @@
+Build.PL
+Changes
+MANIFEST			This list of files
+META.yml
+Makefile.PL
+README
+lib/HTML/SimpleParse.pm
+t/basic.t

Added: branches/upstream/libhtml-simpleparse-perl/current/META.yml
URL: http://svn.debian.org/wsvn/branches/upstream/libhtml-simpleparse-perl/current/META.yml?rev=10185&op=file
==============================================================================
--- branches/upstream/libhtml-simpleparse-perl/current/META.yml (added)
+++ branches/upstream/libhtml-simpleparse-perl/current/META.yml Sat Dec  1 11:58:22 2007
@@ -1,0 +1,14 @@
+--- #YAML:1.0
+name: HTML-SimpleParse
+version: 0.12
+license: perl
+distribution_type: module
+requires: {}
+recommends: {}
+build_requires: {}
+conflicts: {}
+provides:
+  HTML::SimpleParse:
+    file: lib/HTML/SimpleParse.pm
+    version: 0.12
+generated_by: Module::Build version 0.18_02

Added: branches/upstream/libhtml-simpleparse-perl/current/Makefile.PL
URL: http://svn.debian.org/wsvn/branches/upstream/libhtml-simpleparse-perl/current/Makefile.PL?rev=10185&op=file
==============================================================================
--- branches/upstream/libhtml-simpleparse-perl/current/Makefile.PL (added)
+++ branches/upstream/libhtml-simpleparse-perl/current/Makefile.PL Sat Dec  1 11:58:22 2007
@@ -1,0 +1,9 @@
+use ExtUtils::MakeMaker;
+# See lib/ExtUtils/MakeMaker.pm for details of how to influence
+# the contents of the Makefile that is written.
+WriteMakefile(
+    'NAME'	=> 'HTML::SimpleParse',
+    'VERSION_FROM' => 'lib/HTML/SimpleParse.pm', # finds $VERSION
+    'PL_FILES'  => {},
+    'dist' => { COMPRESS=>"gzip", SUFFIX=>"gz" },
+);

Added: branches/upstream/libhtml-simpleparse-perl/current/README
URL: http://svn.debian.org/wsvn/branches/upstream/libhtml-simpleparse-perl/current/README?rev=10185&op=file
==============================================================================
--- branches/upstream/libhtml-simpleparse-perl/current/README (added)
+++ branches/upstream/libhtml-simpleparse-perl/current/README Sat Dec  1 11:58:22 2007
@@ -1,0 +1,43 @@
+Hi,
+
+This is the HTML::SimpleParse module.  It is a bare-bones HTML parser,
+similar to HTML::Parser, but with a couple of important distinctions:
+
+
+First, HTML::Parser knows which tags can contain other tags, which start
+tags have corresponding end tags, which tags can exist only in the <HEAD>
+portion of the document, and so forth.  HTML::SimpleParse does not know any
+of these things.  It just finds tags and text in the HTML you give it; it
+does not care about the specific content of these tags (though it does
+distinguish between different _types_ of tags, such as comments, starting
+tags like <b>, ending tags like </b>, and so on).
+
+Second, HTML::SimpleParse does not create a hierarchical tree of HTML
+content, but rather a simple linear list.  It does not pay any attention to
+balancing start tags with corresponding end tags, or which pairs of tags are
+inside other pairs of tags.
+
+Because of these characteristics, you can make a very effective HTML filter
+by sub-classing HTML::SimpleParse.  For example, to remove all comments:
+
+ package NoComment;
+ use HTML::SimpleParse;
+ @ISA = qw(HTML::SimpleParse);
+ sub output_comment {}
+ 
+ package main;
+ NoComment->new($some_html)->output;
+
+
+For more specific information, please see the documentation inside
+SimpleParse.pm, by running "pod2text lib/HTML/SimpleParse.pm", or "perldoc
+HTML::SimpleParse" once you've installed the module.
+
+To install the module, do the usual:
+
+   perl Makefile.PL
+   make
+   make test
+   make install
+
+-Ken Williams

Added: branches/upstream/libhtml-simpleparse-perl/current/lib/HTML/SimpleParse.pm
URL: http://svn.debian.org/wsvn/branches/upstream/libhtml-simpleparse-perl/current/lib/HTML/SimpleParse.pm?rev=10185&op=file
==============================================================================
--- branches/upstream/libhtml-simpleparse-perl/current/lib/HTML/SimpleParse.pm (added)
+++ branches/upstream/libhtml-simpleparse-perl/current/lib/HTML/SimpleParse.pm Sat Dec  1 11:58:22 2007
@@ -1,0 +1,453 @@
+package HTML::SimpleParse;
+
+use strict;
+use vars qw($VERSION $FIX_CASE);
+
+$VERSION = '0.12';
+my $debug = 0;
+
+sub new {
+  my $pack = shift;
+  
+  my $self = bless {
+		    'text' => shift(),
+		    'tree' => [],
+		    @_
+		   }, $pack;
+  
+  $self->parse() if defined $self->{'text'} and length $self->{'text'};
+  return $self;
+}
+
+sub text {
+  my $self = shift;
+  $self->{'text'} = shift if @_;
+  return $self->{'text'};
+}
+
+sub tree { @{$_[0]->{'tree'}} }
+
+sub parse {
+  # Much of this is a dumbed-down version of HTML::Parser::parse.
+  
+  my $self = shift;
+  my $text = \ $self->{'text'};
+  my $tree = $self->{'tree'};
+  
+  # Parse html text in $$text.  The strategy is to remove complete
+  # tokens from the beginning of $$text until we can't decide whether
+  # it is a token or not, or the $$text is empty.
+  
+  @$tree = ();
+  while (1) {
+    my ($content, $type);
+    
+    # First we try to pull off any plain text (anything before a "<" char)
+    if ($$text =~ /\G([^<]+)/gcs) {
+      $content = $1; $type = 'text';
+      
+      # Then, SSI, comments, and markup declarations (usually <!DOCTYPE...>)
+      # ssi:     <!--#stuff-->
+      # comment: <!--stuff-->
+      # markup:  <!stuff>
+    } elsif ($$text =~ /\G<(!--(\#?).*?--)>/gcs) {
+      $type = ($2 ? 'ssi' : 'comment');
+      $content = $1;
+      
+    } elsif ($$text =~ /\G<(!.*?)>/gcs) {
+      $type = 'markup';
+      $content = $1;
+      
+      # Then, look for an end tag
+    } elsif ($$text =~ m|\G<(/[a-zA-Z][a-zA-Z0-9\.\-]*\s*)>|gcs) {
+      $content = $1; $type = 'endtag';
+      
+      # Then, finally we look for a start tag
+      # We know the first char is <, make sure there's a >
+    } elsif ($$text =~ /\G<(.*?)>/gcs) {
+      $content = $1; $type = 'starttag';
+      
+    } else {
+      # the string is exhausted, or there's no > in it.
+      push @$tree, {
+		    'content' => substr($$text, pos $$text),
+		    'type'    => 'text',
+		   } unless pos($$text) eq length($$text);
+      last;
+    }
+    
+    push @$tree, {
+		  'content' => $content,
+		  'type'    => $type,
+		  'offset'  => ($type eq 'text' ? 
+				pos($$text) - length($content) : 
+				pos($$text) - length($content) - 2),
+		 };
+  }
+  
+  $self;
+}
+
+
+$FIX_CASE = 1;
+sub parse_args {
+  my $self = shift;  # Not needed here
+  my $str = shift;
+  my $fix_case = ((ref $self and exists $self->{fix_case}) ? $self->{fix_case} : $FIX_CASE);
+  my @returns;
+  
+  # Make sure we start searching at the beginning of the string
+  pos($str) = 0;
+  
+  while (1) {
+    next if $str =~ m/\G\s+/gc;  # Get rid of leading whitespace
+    
+    if ( $str =~ m/\G
+	 ([\w.-]+)\s*=\s*                         # the key
+	 (?:
+	  "([^\"\\]*  (?: \\.[^\"\\]* )* )"\s*    # quoted string, with possible whitespace inside,
+	  |                                       #  or
+	  '([^\'\\]*  (?: \\.[^\'\\]* )* )'\s*    # quoted string, with possible whitespace inside,
+	  |                                       #  or
+	  ([^\s>]*)\s*                            # anything else, without whitespace or >
+	 )/gcx ) {
+      
+      my ($key, $val) = ($1, $+);
+      $val =~ s/\\(.)/$1/gs;
+      push @returns, ($fix_case==1 ? uc($key) : $fix_case==-1 ? lc($key) : $key), $val;
+      
+    } elsif ( $str =~ m,\G/?([\w.-]+)\s*,gc ) {
+      push @returns, ($fix_case==1 ? uc($1)   : $fix_case==-1 ? lc($1)   : $1  ), undef;
+    } else {
+      last;
+    }
+  }
+  
+  return @returns;
+}
+
+
+sub execute {
+  my $self = shift;
+  my $ref = shift;
+  my $method = "output_$ref->{type}";
+  warn "calling $self->$method(...)" if $debug;
+  return $self->$method($ref->{content});
+}
+
+sub get_output {
+  my $self = shift;
+  my ($method, $out) = ('', '');
+  foreach ($self->tree) {
+    $out .= $self->execute($_);
+  }
+  return $out;
+}
+
+
+sub output {
+  my $self = shift;
+  my $method;
+  foreach ($self->tree) {
+    print $self->execute($_);
+  }
+}
+
+sub output_text      {   $_[1]; }
+sub output_comment   { "<$_[1]>"; }
+sub output_endtag    { "<$_[1]>"; }
+sub output_starttag  { "<$_[1]>"; }
+sub output_markup    { "<$_[1]>"; }
+sub output_ssi       { "<$_[1]>"; }
+
+1;
+__END__
+
+=head1 NAME
+
+HTML::SimpleParse - a bare-bones HTML parser
+
+=head1 SYNOPSIS
+
+ use HTML::SimpleParse;
+
+ # Parse the text into a simple tree
+ my $p = new HTML::SimpleParse( $html_text );
+ $p->output;                 # Output the HTML verbatim
+ 
+ $p->text( $new_text );      # Give it some new HTML to chew on
+ $p->parse;                  # Parse the new HTML
+ $p->output;
+
+ my %attrs = HTML::SimpleParse->parse_args('A="xx" B=3');
+ # %attrs is now ('A' => 'xx', 'B' => '3')
+
+=head1 DESCRIPTION
+
+This module is a simple HTML parser.  It is similar in concept to HTML::Parser,
+but it differs from HTML::TreeBuilder in a couple of important ways.
+
+First, HTML::TreeBuilder knows which tags can contain other tags, which
+start tags have corresponding end tags, which tags can exist only in
+the <HEAD> portion of the document, and so forth.  HTML::SimpleParse
+does not know any of these things.  It just finds tags and text in the
+HTML you give it; it does not care about the specific content of these
+tags (though it does distinguish between different _types_ of tags,
+such as comments, starting tags like <b>, ending tags like </b>, and
+so on).
+
+Second, HTML::SimpleParse does not create a hierarchical tree of HTML content,
+but rather a simple linear list.  It does not pay any attention to balancing
+start tags with corresponding end tags, or which pairs of tags are inside other
+pairs of tags.
+
+Because of these characteristics, you can make a very effective HTML
+filter by sub-classing HTML::SimpleParse.  For example, to remove all comments 
+from HTML:
+
+ package NoComment;
+ use HTML::SimpleParse;
+ @ISA = qw(HTML::SimpleParse);
+ sub output_comment {}
+ 
+ package main;
+ NoComment->new($some_html)->output;
+
+Historically, I started the HTML::SimpleParse project in part because
+of a misunderstanding about HTML::Parser's functionality.  Many
+aspects of these two modules actually overlap.  I continue to maintain
+the HTML::SimpleParse module because people seem to be depending on
+it, and because beginners sometimes find HTML::SimpleParse to be
+simpler than HTML::Parser's more powerful interface.  People also seem
+to get a fair amount of usage out of the C<parse_args()> method
+directly.
+
+=head2 Methods
+
+=over 4
+
+=item * new
+
+ $p = new HTML::SimpleParse( $some_html );
+
+Creates a new HTML::SimpleParse object.  Optionally takes one argument,
+a string containing some HTML with which to initialize the object.  If
+you give it a non-empty string, the HTML will be parsed into a tree and 
+ready for outputting.
+
+Can also take a list of attributes, such as
+
+ $p = new HTML::SimpleParse( $some_html, 'fix_case' => -1);
+
+See the C<parse_args()> method below for an explanation of this attribute.
+
+=item * text
+
+ $text = $p->text;
+ $p->text( $new_text );
+
+Get or set the contents of the HTML to be parsed.
+
+=item * tree
+
+ foreach ($p->tree) { ... }
+
+Returns a list of all the nodes in the tree, in case you want to step
+through them manually or something.  Each node in the tree is an
+anonymous hash with (at least) three data members, $node->{type} (is
+this a comment, a start tag, an end tag, etc.), $node->{content} (all
+the text between the angle brackets, verbatim), and $node->{offset}
+(number of bytes from the beginning of the string).
+
+The possible values of $node->{type} are C<text>, C<starttag>,
+C<endtag>, C<comment>, C<ssi>, and C<markup>.
+
+=item * parse
+
+ $p->parse;
+
+Once an object has been initialized with some text, call $p->parse and 
+a tree will be created.  After the tree is created, you can call $p->output.
+If you feed some text to the new() method, parse will be called automatically
+during your object's construction.
+
+=item * parse_args
+
+ %hash = $p->parse_args( $arg_string );
+
+This routine is handy for parsing the contents of an HTML tag into key=value
+pairs.  For instance:
+
+  $text = 'type=checkbox checked name=flavor value="chocolate or strawberry"';
+  %hash = $p->parse_args( $text );
+  # %hash is ( TYPE=>'checkbox', CHECKED=>undef, NAME=>'flavor',
+  #            VALUE=>'chocolate or strawberry' )
+
+Note that the position of the last m//g search on the string (the value 
+returned by Perl's pos() function) will be altered by the parse_args function,
+so make sure you take that into account if (in the above example) you do
+C<$text =~ m/something/g>.
+
+The parse_args() method can be run as either an object method or as a
+class method, i.e. as either $p->parse_args(...) or
+HTML::SimpleParse->parse_args(...).
+
+HTML attribute lists are supposed to be case-insensitive with respect
+to attribute names.  To achieve this behavior, parse_args() respects
+the 'fix_case' flag, which can be set either as the package global
+$FIX_CASE, or as an object data member 'fix_case'.  If set to 0, no
+case conversion is done.  If set to 1, all keys are converted to upper
+case.  If set to -1, all keys are converted to lower case.  The
+default is 1, i.e. all keys are uppercased.
+
+If an attribute takes no value (like "checked" in the above example) then it
+will still have an entry in the returned hash, but its value will be C<undef>.
+For example:
+
+  %hash = $p->parse_args('type=checkbox checked name=banana value=""');
+  # $hash{CHECKED} is undef, but $hash{VALUE} is ""
+
+This method actually returns a list (not a hash), so duplicate attributes and
+order will be preserved if you want them to be:
+
+ @hash = $p->parse_args("name=family value=gwen value=mom value=pop");
+ # @hash is qw(NAME family VALUE gwen VALUE mom VALUE pop)
+
+=item * output
+
+ $p->output;
+
+This will output the contents of the HTML, passing the real work off to
+the output_text, output_comment, etc. functions.  If you do not override any
+of these methods, this module will output the exact text that it parsed into
+a tree in the first place.
+
+=item * get_output
+
+ print $p->get_output
+
+Similar to $p->output(), but returns its result instead of printing it.
+
+=item * execute
+
+ foreach ($p->tree) {
+    print $p->execute($_);
+ }
+
+Executes a single node in the HTML parse tree.  Useful if you want to loop
+through the nodes and output them individually.
+
+=back
+
+The following methods do the actual outputting of the various parts of
+the HTML.  Override some of them if you want to change the way the HTML
+is output.  For instance, to strip comments from the HTML, override the
+output_comment method like so:
+
+ # In subclass:
+ sub output_comment { }  # Does nothing
+
+=over 4
+
+=item * output_text
+
+=item * output_comment
+
+=item * output_endtag
+
+=item * output_starttag
+
+=item * output_markup
+
+=item * output_ssi
+
+
+=back
+
+=head1 CAVEATS
+
+Please do not assume that the interface here is stable.  This is a first pass, 
+and I'm still trying to incorporate suggestions from the community.  If you
+employ this module somewhere, make doubly sure before upgrading that none of your
+code breaks when you use the newer version.
+
+
+=head1 BUGS
+
+=over 4
+
+=item * Embedded >s are broken
+
+Won't handle tags with embedded >s in them, like
+<input name=expr value="x > y">.  This will be fixed in a future
+version, probably by using the parse_args method.  Suggestions are welcome.
+
+=back
+
+=head1 TO DO
+
+=over 4
+
+=item * extensibility
+
+Based on a suggestion from Randy Harmon (thanks), I'd like to make it easier
+for subclasses of SimpleParse to pick out other kinds of HTML blocks, i.e.
+extend the set {text, comment, endtag, starttag, markup, ssi} to include more
+members.  Currently the only easy way to do that is by overriding the 
+C<parse> method:
+
+ sub parse {  # In subclass
+    my $self = $_[0];
+    $self->SUPER::parse(@_);
+    foreach ($self->tree) {
+       if ($_->{content} =~ m#^a\s+#i) {
+          $_->{type} = 'anchor_start';
+       }
+    }
+ }
+
+ sub output_anchor_start {
+    # Whatever you want...
+ }
+
+Alternatively, this feature might be implemented by hanging attachments
+onto the parsing loop, like this:
+
+ my $parser = new SimpleParse( $html_text );
+ $regex = '<(a\s+.*?)>';
+ $parser->watch_for( 'anchor_start', $regex );
+ 
+ sub SimpleParse::output_anchor_start {
+    # Whatever you want...
+ }
+
+I think I like that idea better.  If you wanted to, you could make a subclass
+with output_anchor_start as one of its methods, and put the ->watch_for 
+stuff in the constructor.
+
+
+=item * reading from filehandles
+
+It would be nice if you could initialize an object by giving it a filehandle
+or filename instead of the text itself.
+
+=item * tests
+
+I need to write a few tests that run under "make test".
+
+
+=back
+
+
+=head1 AUTHOR
+
+Ken Williams <ken at forum.swarthmore.edu>
+
+=head1 COPYRIGHT
+
+Copyright 1998 Swarthmore College.  All rights reserved.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut

Added: branches/upstream/libhtml-simpleparse-perl/current/t/basic.t
URL: http://svn.debian.org/wsvn/branches/upstream/libhtml-simpleparse-perl/current/t/basic.t?rev=10185&op=file
==============================================================================
--- branches/upstream/libhtml-simpleparse-perl/current/t/basic.t (added)
+++ branches/upstream/libhtml-simpleparse-perl/current/t/basic.t Sat Dec  1 11:58:22 2007
@@ -1,0 +1,130 @@
+#!/usr/bin/perl -w
+
+use Test;
+BEGIN { plan tests => 23 }
+use HTML::SimpleParse;
+ok 1;
+
+use Carp;
+$SIG{__WARN__} = \&Carp::cluck;
+
+{
+	my %hash = HTML::SimpleParse->parse_args('A="xx" B=3');
+	ok $hash{A}, "xx";
+	ok $hash{B}, 3;
+}
+
+{
+	my %hash = HTML::SimpleParse->parse_args('A="xx" B');
+	ok $hash{A}, "xx";
+	ok exists $hash{B};
+}
+
+{
+	my %hash = HTML::SimpleParse->parse_args('A="xx" B c="hi" ');
+	ok $hash{A}, "xx";
+	ok exists $hash{B};
+	ok $hash{C}, "hi";
+}
+
+{
+	my $text = 'type=checkbox checked name=flavor value="chocolate or strawberry"';
+	my %hash = HTML::SimpleParse->parse_args( $text );
+	ok $hash{TYPE}, "checkbox";
+	ok exists $hash{CHECKED};
+	ok $hash{VALUE}, "chocolate or strawberry";
+}
+
+{
+	my %hash=HTML::SimpleParse->parse_args(' A="xx" B');
+	ok $hash{A}, 'xx';
+	ok exists $hash{B};
+}
+
+{
+	my $text = <<EOF;
+	<html><head>
+	<title>Hiya, tester</title>
+	</head>
+	
+	<body>
+	<center><h1>Hiya, tester</h1></center>
+	<!-- here is a comment -->
+	<!DOCTYPE here is a markup>
+	<!--# here is an ssi -->
+	</body>
+	</html>
+EOF
+	my $p = new HTML::SimpleParse( $text );
+	
+	ok $p->get_output(), $text;
+}
+
+{
+	my %hash = HTML::SimpleParse->parse_args('a="b=c"');
+	ok $hash{A}, "b=c";
+}
+
+{
+	my %hash = HTML::SimpleParse->parse_args('val="a \"value\""');
+	ok $hash{VAL}, 'a "value"';
+}
+
+{
+  my %hash = HTML::SimpleParse->parse_args('val = "a \"value\""');
+  ok $hash{VAL}, 'a "value"';
+}
+
+{
+  # Avoid 'uninitialized value' warning
+  my $ok=1;
+  local $^W=1;
+  local $SIG{__WARN__} = sub {$ok=0};
+  HTML::SimpleParse->new();
+  ok $ok;
+}
+
+{
+  my %hash = HTML::SimpleParse->parse_args("val='a value'");
+  ok $hash{VAL}, 'a value';
+}
+
+{
+  local $HTML::SimpleParse::FIX_CASE = 0;
+  my %hash = HTML::SimpleParse->parse_args("val='a value'");
+  ok $hash{val}, 'a value';
+}
+
+{
+  local $HTML::SimpleParse::FIX_CASE = 0;
+  my %hash = HTML::SimpleParse->parse_args("Val='a value'");
+  ok $hash{Val}, 'a value';
+}
+
+{
+  my $p = new HTML::SimpleParse('', fix_case => 0);
+  my %hash = $p->parse_args("Val='a value'");
+  ok $hash{Val}, 'a value';
+}
+
+{
+  my $text = <<EOF;
+    <html><head>
+    <title>Hiya, tester</title>
+    </head>
+	
+    <body>
+    <center><h1>Hiya, tester</h1></center>
+    <!-- here is a comment -->
+    <!DOCTYPE here is a markup>
+    <!--# here is an ssi -->
+    </body>
+    </html>
+EOF
+  my $p = new HTML::SimpleParse($text);
+  my $ok = 1;
+  foreach ($p->tree) {
+    $ok = 0 unless substr($text, $_->{offset}) =~ /^<?\Q$_->{content}/;
+  }
+  ok $ok;
+}




More information about the Pkg-perl-cvs-commits mailing list