[geneagrapher] 153/226: Incorporated BeautifulSoup into Grabber.
Doug Torrance
dtorrance-guest at moszumanska.debian.org
Sat Jul 11 17:10:57 UTC 2015
This is an automated email from the git hooks/post-receive script.
dtorrance-guest pushed a commit to branch master
in repository geneagrapher.
commit 3b202b07882135aa48e492db646a7a23d401be27
Author: David Alber <alber.david at gmail.com>
Date: Sun Oct 30 21:13:17 2011 -0700
Incorporated BeautifulSoup into Grabber.
---
geneagrapher/grabber.py | 70 +++++++++++++++++--------------------------
setup.py | 2 +-
tests/test_grabber_methods.py | 10 ++-----
3 files changed, 32 insertions(+), 50 deletions(-)
diff --git a/geneagrapher/grabber.py b/geneagrapher/grabber.py
index eebb3e0..06cf1b5 100644
--- a/geneagrapher/grabber.py
+++ b/geneagrapher/grabber.py
@@ -1,6 +1,7 @@
import urllib
import re
from htmlentitydefs import name2codepoint
+from BeautifulSoup import BeautifulSoup
class Grabber:
"""
@@ -9,7 +10,6 @@ class Grabber:
"""
def __init__(self, id):
self.id = id
- self.pagestr = None
self.name = None
self.institution = None
self.year = None
@@ -25,11 +25,8 @@ class Grabber:
"""
Grab the page for self.id from the Math Genealogy Database.
"""
- if self.pagestr is None:
- url = 'http://genealogy.math.ndsu.nodak.edu/id.php?id=' + str(self.id)
- page = urllib.urlopen(url)
- self.pagestr = page.read()
- self.pagestr = self.pagestr.decode('utf-8')
+ url = 'http://genealogy.math.ndsu.nodak.edu/id.php?id=' + str(self.id)
+ return urllib.urlopen(url)
def extract_node_information(self):
"""
@@ -37,51 +34,40 @@ class Grabber:
advisor ids, the mathematician name, the mathematician
institution, and the year of the mathematician's degree.
"""
- if self.pagestr is None:
- self.get_page()
+ page = self.get_page()
+ soup = BeautifulSoup(page, convertEntities='html')
+ page.close()
self.advisors = []
self.descendants = []
- # Split the page string at newline characters.
- psarray = self.pagestr.split('\n')
-
- if psarray[0].find("You have specified an ID that does not exist in the database. Please back up and try again.") > -1:
+ if soup.firstText().text == u"You have specified an ID that does not exist in the database. Please back up and try again.":
# Then a bad URL (e.g., a bad record id) was given. Throw an exception.
msg = "Invalid page address for id {}".format(self.id)
raise ValueError(msg)
- lines = iter(psarray)
- for line in lines:
- if line.find('h2 style=') > -1:
- line = lines.next()
- self.name = self.unescape(line.split('</h2>')[0].strip())
+ # Get mathematician name.
+ self.name = soup.find('h2').getText()
- if '#006633; margin-left: 0.5em">' in line:
- inst_year = line.split('#006633; margin-left: 0.5em">')[1].split("</span>")[:2]
- self.institution = self.unescape(inst_year[0].strip())
- if self.institution == u"":
- self.institution = None
- if inst_year[1].split(',')[0].strip().isdigit():
- self.year = int(inst_year[1].split(',')[0].strip())
+ # Get institution name (or None, if it there is no institution name).
+ self.institution = soup.find('div', style="line-height: 30px; text-align: center; margin-bottom: 1ex").find('span').find('span').text
+ if self.institution == u'':
+ self.institution = None
- if 'Advisor' in line:
- advisorLine = line
- while 'Advisor' in advisorLine:
- if 'a href=\"id.php?id=' in line:
- # Extract link to advisor page.
- advisor_id = int(advisorLine.split('a href=\"id.php?id=')[1].split('\">')[0])
- self.advisors.append(advisor_id)
- advisorLine = advisorLine.split(str(advisor_id))[1]
- else:
- # We are done. Adjust string to break the loop.
- # (Without this records with no advisor enter an infinite loop.)
- advisorLine = ""
+ # Get graduation year, if present.
+ inst_year = soup.find('div', style="line-height: 30px; text-align: center; margin-bottom: 1ex").find('span').contents[-1].strip()
+ if inst_year.isdigit():
+ self.year = int(inst_year)
- if '<tr ' in line:
- descendant_id = int(line.split('a href=\"id.php?id=')[1].split('\">')[0])
- self.descendants.append(descendant_id)
-
- if 'According to our current on-line database' in line:
- break
+ # Get advisor IDs.
+ for advisor_info in soup.findAll(text=re.compile('Advisor')):
+ if 'Advisor: Unknown' not in advisor_info:
+ advisor_id = advisor_info.findNext().attrs[0][-1].split('=')[1]
+ self.advisors.append(int(advisor_id))
+
+ # Get descendant IDs.
+ if soup.find('table') is not None:
+ for descendant_info in soup.find('table').findAll('a'):
+ descendant_id = descendant_info.attrs[0][-1].split('=')[-1]
+ self.descendants.append(int(descendant_id))
return [self.name, self.institution, self.year, self.advisors, self.descendants]
diff --git a/setup.py b/setup.py
index 1b7b1df..79b0e38 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@ genealogies using data from the Math Genealogy Project's website.
url="http://www.davidalber.net/",
license="MIT",
packages=find_packages(exclude='tests'),
- install_requires=[],
+ install_requires=['BeautifulSoup >= 3.2.0'],
entry_points = {
'console_scripts': [
'ggrapher = geneagrapher.geneagrapher:ggrapher'
diff --git a/tests/test_grabber_methods.py b/tests/test_grabber_methods.py
index 991e765..e77fa86 100644
--- a/tests/test_grabber_methods.py
+++ b/tests/test_grabber_methods.py
@@ -11,7 +11,6 @@ class TestGrabberMethods(unittest.TestCase):
def test001_init(self):
# Test constructor.
self.assertEquals(self.grabber.id, 18231)
- self.assertEquals(self.grabber.pagestr, None)
self.assertEquals(self.grabber.name, None)
self.assertEquals(self.grabber.institution, None)
self.assertEquals(self.grabber.year, None)
@@ -20,12 +19,9 @@ class TestGrabberMethods(unittest.TestCase):
def test002_get_page(self):
# Test get_page() method.
- self.grabber.get_page()
- self.assert_(self.grabber.pagestr is not None)
- self.assert_(u"<title>The Mathematics Genealogy Project - Carl Gau\xdf</title>" in self.grabber.pagestr)
- # Get page again and test for adverse affects.
- self.grabber.get_page()
- self.assert_(u"<title>The Mathematics Genealogy Project - Carl Gau\xdf</title>" in self.grabber.pagestr)
+ page = self.grabber.get_page()
+ pagestr = page.read()
+ self.assert_("<title>The Mathematics Genealogy Project - Carl Gau" in pagestr)
def test003_extract_info_bad(self):
# Verify exception thrown for bad id.
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/geneagrapher.git
More information about the debian-science-commits mailing list