[geneagrapher] 153/226: Incorporated BeautifulSoup into Grabber.

Sat Jul 11 17:10:57 UTC 2015

This is an automated email from the git hooks/post-receive script.

dtorrance-guest pushed a commit to branch master
in repository geneagrapher.

commit 3b202b07882135aa48e492db646a7a23d401be27
Author: David Alber <alber.david at gmail.com>
Date:   Sun Oct 30 21:13:17 2011 -0700

    Incorporated BeautifulSoup into Grabber.
---
 geneagrapher/grabber.py       | 70 +++++++++++++++++--------------------------
 setup.py                      |  2 +-
 tests/test_grabber_methods.py | 10 ++-----
 3 files changed, 32 insertions(+), 50 deletions(-)

diff --git a/geneagrapher/grabber.py b/geneagrapher/grabber.py
index eebb3e0..06cf1b5 100644
--- a/geneagrapher/grabber.py
+++ b/geneagrapher/grabber.py
@@ -1,6 +1,7 @@
 import urllib
 import re
 from htmlentitydefs import name2codepoint
+from BeautifulSoup import BeautifulSoup
 
 class Grabber:
     """
@@ -9,7 +10,6 @@ class Grabber:
     """
     def __init__(self, id):
         self.id = id
-        self.pagestr = None
         self.name = None
         self.institution = None
         self.year = None
@@ -25,11 +25,8 @@ class Grabber:
         """
         Grab the page for self.id from the Math Genealogy Database.
         """
-        if self.pagestr is None:
-            url = 'http://genealogy.math.ndsu.nodak.edu/id.php?id=' + str(self.id)
-            page = urllib.urlopen(url)
-            self.pagestr = page.read()
-            self.pagestr = self.pagestr.decode('utf-8')
+        url = 'http://genealogy.math.ndsu.nodak.edu/id.php?id=' + str(self.id)
+        return urllib.urlopen(url)
             
     def extract_node_information(self):
         """
@@ -37,51 +34,40 @@ class Grabber:
         advisor ids, the mathematician name, the mathematician
         institution, and the year of the mathematician's degree.
         """
-        if self.pagestr is None:
-            self.get_page()
+        page = self.get_page()
+        soup = BeautifulSoup(page, convertEntities='html')
+        page.close()
             
         self.advisors = []
         self.descendants = []
 
-        # Split the page string at newline characters.
-        psarray = self.pagestr.split('\n')
-        
-        if psarray[0].find("You have specified an ID that does not exist in the database. Please back up and try again.") > -1:
+        if soup.firstText().text == u"You have specified an ID that does not exist in the database. Please back up and try again.":
             # Then a bad URL (e.g., a bad record id) was given. Throw an exception.
             msg = "Invalid page address for id {}".format(self.id)
             raise ValueError(msg)
 
-        lines = iter(psarray)
-        for line in lines:
-            if line.find('h2 style=') > -1:
-                line = lines.next()
-                self.name = self.unescape(line.split('</h2>')[0].strip())
+        # Get mathematician name.
+        self.name = soup.find('h2').getText()
 
-            if '#006633; margin-left: 0.5em">' in line:
-                inst_year = line.split('#006633; margin-left: 0.5em">')[1].split("</span>")[:2]
-                self.institution = self.unescape(inst_year[0].strip())
-                if self.institution == u"":
-                    self.institution = None
-                if inst_year[1].split(',')[0].strip().isdigit():
-                    self.year = int(inst_year[1].split(',')[0].strip())
+        # Get institution name (or None, if it there is no institution name).
+        self.institution = soup.find('div', style="line-height: 30px; text-align: center; margin-bottom: 1ex").find('span').find('span').text
+        if self.institution == u'':
+            self.institution = None
 
-            if 'Advisor' in line:
-                advisorLine = line
-                while 'Advisor' in advisorLine:
-                    if 'a href=\"id.php?id=' in line:
-                        # Extract link to advisor page.
-                        advisor_id = int(advisorLine.split('a href=\"id.php?id=')[1].split('\">')[0])
-                        self.advisors.append(advisor_id)
-                        advisorLine = advisorLine.split(str(advisor_id))[1]
-                    else:
-                        # We are done. Adjust string to break the loop.
-                        # (Without this records with no advisor enter an infinite loop.)
-                        advisorLine = ""
+        # Get graduation year, if present.
+        inst_year = soup.find('div', style="line-height: 30px; text-align: center; margin-bottom: 1ex").find('span').contents[-1].strip()
+        if inst_year.isdigit():
+            self.year = int(inst_year)
 
-            if '<tr ' in line:
-                descendant_id = int(line.split('a href=\"id.php?id=')[1].split('\">')[0])
-                self.descendants.append(descendant_id)
-                
-            if 'According to our current on-line database' in line:
-                break
+        # Get advisor IDs.
+        for advisor_info in soup.findAll(text=re.compile('Advisor')):
+            if 'Advisor: Unknown' not in advisor_info:
+                advisor_id = advisor_info.findNext().attrs[0][-1].split('=')[1]
+                self.advisors.append(int(advisor_id))
+
+        # Get descendant IDs.
+        if soup.find('table') is not None:
+            for descendant_info in soup.find('table').findAll('a'):
+                descendant_id = descendant_info.attrs[0][-1].split('=')[-1]
+                self.descendants.append(int(descendant_id))
         return [self.name, self.institution, self.year, self.advisors, self.descendants]
diff --git a/setup.py b/setup.py
index 1b7b1df..79b0e38 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@ genealogies using data from the Math Genealogy Project's website.
       url="http://www.davidalber.net/",
       license="MIT",
       packages=find_packages(exclude='tests'),
-      install_requires=[],
+      install_requires=['BeautifulSoup >= 3.2.0'],
       entry_points = {
         'console_scripts': [
             'ggrapher = geneagrapher.geneagrapher:ggrapher'
diff --git a/tests/test_grabber_methods.py b/tests/test_grabber_methods.py
index 991e765..e77fa86 100644
--- a/tests/test_grabber_methods.py
+++ b/tests/test_grabber_methods.py
@@ -11,7 +11,6 @@ class TestGrabberMethods(unittest.TestCase):
     def test001_init(self):
         # Test constructor.
         self.assertEquals(self.grabber.id, 18231)
-        self.assertEquals(self.grabber.pagestr, None)
         self.assertEquals(self.grabber.name, None)
         self.assertEquals(self.grabber.institution, None)
         self.assertEquals(self.grabber.year, None)
@@ -20,12 +19,9 @@ class TestGrabberMethods(unittest.TestCase):
 
     def test002_get_page(self):
         # Test get_page() method.
-        self.grabber.get_page()
-        self.assert_(self.grabber.pagestr is not None)
-        self.assert_(u"<title>The Mathematics Genealogy Project - Carl Gau\xdf</title>" in self.grabber.pagestr)
-        # Get page again and test for adverse affects.
-        self.grabber.get_page()
-        self.assert_(u"<title>The Mathematics Genealogy Project - Carl Gau\xdf</title>" in self.grabber.pagestr)
+        page = self.grabber.get_page()
+        pagestr = page.read()
+        self.assert_("<title>The Mathematics Genealogy Project - Carl Gau" in pagestr)
 
     def test003_extract_info_bad(self):
         # Verify exception thrown for bad id.

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/geneagrapher.git