[libnet-openid-common-perl] 06/09: use HTML::Parser for parsing HTML

Sun Feb 7 21:50:29 UTC 2016

This is an automated email from the git hooks/post-receive script.

gregoa pushed a commit to annotated tag v1.12
in repository libnet-openid-common-perl.

commit 828d44005879bc24aa529b923b7b9b0059ce2feb
Author: Roger Crew <crew at cs.stanford.edu>
Date:   Tue Oct 25 11:24:14 2011 -0700

    use HTML::Parser for parsing HTML
    
    finally.
---
 dist.ini                 |   1 +
 lib/Net/OpenID/Common.pm |  90 ++++++++++++++++++++++++++-
 t/07-htmlparse.t         | 155 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 245 insertions(+), 1 deletion(-)

diff --git a/dist.ini b/dist.ini
index d658870..3a46f2e 100644
--- a/dist.ini
+++ b/dist.ini
@@ -31,6 +31,7 @@ filenames = dist.ini
 [Prereqs]
 HTTP::Request  = 0
 HTTP::Status   = 0
+HTML::Parser   = 3.40
 XML::Simple    = 0
 Time::Local    = 0
 MIME::Base64   = 0
diff --git a/lib/Net/OpenID/Common.pm b/lib/Net/OpenID/Common.pm
index 2c4ea9c..4deea15 100644
--- a/lib/Net/OpenID/Common.pm
+++ b/lib/Net/OpenID/Common.pm
@@ -31,6 +31,7 @@ use Math::BigInt;
 use Time::Local ();
 use MIME::Base64 ();
 use URI::Escape ();
+use HTML::Parser ();
 
 use constant VERSION_1_NAMESPACE => "http://openid.net/signon/1.1";
 use constant VERSION_2_NAMESPACE => "http://specs.openid.net/auth/2.0";
@@ -199,7 +200,94 @@ sub get_dh {
     return $dh;
 }
 
+
+################################################################
 # HTML parsing
+#
+# This is a stripped-down version of HTML::HeadParser
+# that only recognizes <link> and <meta> tags
+
+our @_linkmeta_parser_options =
+  (
+   api_version => 3,
+   ignore_elements => [qw(script style base isindex command noscript title object)],
+
+   start_document_h
+   => [sub {
+           my($p) = @_;
+           $p->{first_chunk} = 0;
+           $p->{found} = {};
+       },
+       "self"],
+
+   end_h
+   => [sub {
+           my($p,$tag) = @_;
+           $p->eof if $tag eq 'head'
+       },
+       "self,tagname"],
+
+   start_h
+   => [sub {
+           my($p, $tag, $attr) = @_;
+           if ($tag eq 'meta' || $tag eq 'link') {
+               if ($tag eq 'link' && ($attr->{rel}||'') =~ m/\s/) {
+                   # split <link rel="foo bar..." href="whatever"... />
+                   # into multiple <link>s
+                   push @{$p->{found}->{$tag}},
+                     map { +{%{$attr}, rel => $_} }
+                       split /\s+/,$attr->{rel};
+               }
+               else {
+                   push @{$p->{found}->{$tag}}, $attr;
+               }
+           }
+           elsif ($tag ne 'head' && $tag ne 'html') {
+               # stop parsing
+               $p->eof;
+           }
+       },
+       "self,tagname,attr"],
+
+   text_h
+   => [sub {
+           my($p, $text) = @_;
+           unless ($p->{first_chunk}) {
+               # drop Unicode BOM if found
+               if ($p->utf8_mode) {
+                   $text =~ s/^\xEF\xBB\xBF//;
+               }
+               else {
+                   $text =~ s/^\x{FEFF}//;
+               }
+               $p->{first_chunk}++;
+           }
+           # Normal text outside of an allowed <head> tag
+           # means start of body
+           $p->eof if ($text =~ /\S/);
+       },
+       "self,text"],
+  );
+
+# XXX this line is also in HTML::HeadParser; do we need it?
+# current theory is we don't because we're requiring at
+# least version 3.40 which is already pretty ancient.
+# 
+# *utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT;
+
+our $_linkmeta_parser;
+
+# return { link => [links...], meta => [metas...] }
+# where each link/meta is a hash of the attribute values
+sub html_extract_linkmetas {
+    my $doc = shift;
+    $_linkmeta_parser ||= HTML::Parser->new(@_linkmeta_parser_options);
+    $_linkmeta_parser->parse($doc);
+    $_linkmeta_parser->eof;
+    return delete $_linkmeta_parser->{found};
+}
+
+### DEPRECATED, do not use, will be removed Real Soon Now
 sub _extract_head_markup_only {
     my $htmlref = shift;
 
@@ -209,7 +297,7 @@ sub _extract_head_markup_only {
     # kill all comments
     $$htmlref =~ s/<!--.*?-->//sg;
     # ***FIX?*** Strictly speaking, SGML comments must have matched
-    # pairs of '--'s but almost nobody checks for this or even knows 
+    # pairs of '--'s but almost nobody checks for this or even knows
 
     # trim everything past the body.  this is in case the user doesn't
     # have a head document and somebody was able to inject their own
diff --git a/t/07-htmlparse.t b/t/07-htmlparse.t
new file mode 100644
index 0000000..8080aeb
--- /dev/null
+++ b/t/07-htmlparse.t
@@ -0,0 +1,155 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+use Test::More;
+use Net::OpenID::Common;
+
+sub html_is { is_deeply(OpenID::util::html_extract_linkmetas(shift), at _) }
+
+html_is('plain text hello world',{},'plain') ;
+html_is('<body><link rel="boo" href="not real"></body>',{},'body');
+html_is('<link rel=boo href=real>',{link =>[{rel=>'boo',href=>'real'}]},'nohead');
+
+my $p1 = 'https://api.screenname.aol.com/auth/openidServer';
+my $doc1 = <<END ;
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"><html><head><link rel="openid.server" href="$p1"/><link rel="openid2.provider" href="$p1"/><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><title>AOL OpenId</title><meta http-equiv="refresh" content="0;url=https://api.screenname.aol.com/auth/openid/name/test"></head><body>If not redirected automatically, please click <a href="https://api.screenname.aol.com/auth/openid/name/test">h [...]
+END
+my $r1 = 
+{
+link => [
+ {rel=>"openid.server", href=>"$p1",'/'=>'/'},
+ {rel=>"openid2.provider", href=>"$p1",'/'=>'/'},
+],
+meta => [
+ {'http-equiv'=>"Content-Type", content=>"text/html; charset=UTF-8"},
+ {'http-equiv'=>"refresh", content=>"0;url=https://api.screenname.aol.com/auth/openid/name/test"},
+]
+};
+my $r0;
+html_is($doc1,$r1,'basic') ;
+done_testing();
+
+__END__
+
+my $uri2 = 'http://openid.example.com/everything_in_comments';
+addf_uri($uri2,content => <<END );
+<html><head><link rel="openid.server"   href="http://www.livejournal.com/misc/openid.bml" />
+<link rel="openid.delegate" href="http://openid1.net/delegate" />
+<link rel="openid2.provider" href="http://www.livejournal.com/misc/openid.bml" />
+<link rel="openid2.local_id" href="http://openid2.net/delegate" />
+<meta name="foaf:maker" content="foaf:mbox_sha1sum '4caa1d6f6203d21705a00a7aca86203e82a9cf7a'"/>
+<link rel="meta" type="application/rdf+xml" title="FOAF"
+      href="http://brad.livejournal.com/data/foaf" />
+<link rel="alternate" type="application/rss+xml" title="RSS"
+      href="http://www.livejournal.com/~brad/data/rss" />
+<link rel="alternate" type="application/atom+xml" title="Atom" 
+      href="http://www.livejournal.com/~brad/data/atom" /></head><body>Bite me</body></html>
+END
+is_deeply($csr->_find_semantic_info($uri2),
+{'openid.server'=>'http://www.livejournal.com/misc/openid.bml',
+ 'openid.delegate'=>'http://openid1.net/delegate',
+ 'openid2.provider'=>'http://www.livejournal.com/misc/openid.bml',
+ 'openid2.local_id'=>'http://openid2.net/delegate',
+ 'foaf.maker'=> "foaf:mbox_sha1sum '4caa1d6f6203d21705a00a7aca86203e82a9cf7a'",
+ 'foaf'=>"http://brad.livejournal.com/data/foaf",
+ 'rss'=>"http://www.livejournal.com/~brad/data/rss",
+ 'atom'=>"http://www.livejournal.com/~brad/data/atom",
+},'everything from consumer.pm comments' );
+
+
+my $uri3 = 'http://openid.example.com/cdata_crap';
+addf_uri($uri3,content => <<END );
+<html><head>
+<link rel="openid.server"   href="http://www.livejournal.com/misc/openid.bml" />
+<link rel="openid.delegate" href="http://openid1.net/delegate" />
+<script type="text/javascript">//<![CDATA[
+var toss = '
+<link rel="openid2.provider" href="http://www.livejournal.com/misc/openid2.bml" />
+<link rel="openid2.local_id" href="http://openid2.net/delegate" />
+<meta name="foaf:maker" content="foaf:mbox_sha1sum \'4caa1d6f6203d21705a00a7aca86203e82a9cf7a\'"/>
+'; // ]]>
+</script>
+<!-- <!---- comment me out
+<link rel="meta" type="application/rdf+xml" title="FOAF"
+     href="http://brad.livejournal.com/data/foaf" /> 
+oh and comments do not nest so the next one is real -->
+<link rel="alternate" type="application/rss+xml" title="RSS"
+      href="http://www.livejournal.com/~brad/data/rss" /> <!-- -->
+<style type="text/css"><![CDATA[
+hr { visibility:none msg:make sure the first CDATA is not grabbing too much }
+]]></style>
+<link rel="alternate" type="application/atom+xml" title="Atom" 
+      href="http://www.livejournal.com/~brad/data/atom" /></head><body>bitez moi</body></html>
+END
+is_deeply($csr->_find_semantic_info($uri3),
+{'openid.server'=>'http://www.livejournal.com/misc/openid.bml',
+ 'openid.delegate'=>'http://openid1.net/delegate',
+ 'rss'=>"http://www.livejournal.com/~brad/data/rss",
+ 'atom'=>"http://www.livejournal.com/~brad/data/atom",
+},'CDATA/comment silliness' );
+
+
+my $uri4 = 'http://openid.aol.com/oldstyle';
+addf_uri($uri4,content => <<END );
+<HTML><HEAD>
+<LINK REL=xopenid.serverx HREF="not it" />
+<LINK REL=openid.delegate HREF="http://openid1.net/delegate"></HEAD>
+<BODY><head><LINK REL=openid2.provider HREF="not it either"></head></BODY></HTML>
+END
+is_deeply($csr->_find_semantic_info($uri4),
+{'openid.delegate'=>'http://openid1.net/delegate'},'HTML 4.0- test');
+
+my $uri4a = 'http://openid.aol.com/oldstyle2';
+addf_uri($uri4a,content => <<END );
+<HTML><HEAD>
+<LINK REL=xopenid.serverx HREF="not it" />
+<LINK REL=openid.delegate HREF="http://openid1.net/delegate?x=1&y=2&z=3"></HEAD>
+<BODY><head><LINK REL=openid2.provider HREF="not it either"></head></BODY></HTML>
+END
+is_deeply($csr->_find_semantic_info($uri4a),
+{'openid.delegate'=>'http://openid1.net/delegate?x=1&y=2&z=3'},'HTML 4.0- test');
+
+my $uri5 = 'http://google.com/somewhere';
+addf_uri($uri5,content => <<END );
+<html>
+<head> <meta http-equiv="content-type" content="text/html; charset=utf-8"/> <title> OpenID for Google Accounts </title> <link rel="openid2.provider openid.server" href="http://openid-provider.appspot.com/joey%40kitenet.net" /> <link href="/static/base.css" rel="stylesheet" type="text/css"/> 
+</head><body>bye</body></html>
+END
+my $answer5 = 
+{'openid2.provider'=>'http://openid-provider.appspot.com/joey%40kitenet.net',
+ 'openid.server'=>'http://openid-provider.appspot.com/joey%40kitenet.net'
+};
+is_deeply($csr->_find_semantic_info($uri5), $answer5,'link with two refs in it');
+is_deeply($csr->_find_semantic_info($uri5), $answer5,'link with two refs in it(again)');
+addf_uri($uri5,content => 'randomness');
+is_deeply($csr->_find_semantic_info($uri5), $answer5,'link with two refs in it(yet again)');
+
+my $uri4b = 'http://openid.aol.com/oldstyle4b';
+addf_uri($uri4b,content => <<END );
+<HTML><HEAD>
+<LINK REL=xopenid.serverx HREF="not it" />
+<LINK REL=openid.delegate HREF="http://op&#x65;nid1.net/delegate?x=1&y=2&z=3"></HEAD>
+<BODY><head><LINK REL=openid2.provider HREF="not it either"></head></BODY></HTML>
+END
+is_deeply($csr->_find_semantic_info($uri4b),
+{'openid.delegate'=>'http://openid1.net/delegate?x=1&y=2&z=3'},'numerical entities');
+
+my $uri6 = 'http://google.com/somewhere6';
+addf_uri($uri6,content => <<END );
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">
+  <link rel=openid2.provider href='http://openid.example.com/~user'>
+  <title>Nice test</title>
+  <form action="doit">
+  <p>Send me your comment:
+  <input type=text name=comment value='<html><head><link
+rel=openid2.provider href="http://bogous.example.net"></head>'>
+  <input type=submit>
+  </form>
+END
+is_deeply($csr->_find_semantic_info($uri6),
+{
+'openid2.provider' => 'http://openid.example.com/~user',
+},'headless injection example');
+
+1;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libnet-openid-common-perl.git