[geneagrapher] 06/226: * Modified grab.py to implement a class called Grabber that behaves similarly to the previous implementation in most ways. Some changes to functionality were made. * Generated unit tests for the Grabber class. This resolves ticket #2.

Sat Jul 11 17:10:30 UTC 2015

This is an automated email from the git hooks/post-receive script.

dtorrance-guest pushed a commit to branch master
in repository geneagrapher.

commit e0f5eab2cd2ec45e2332472e753500a6284b23be
Author: David Alber <alber.david at gmail.com>
Date:   Wed Apr 9 06:07:18 2008 +0000

     * Modified grab.py to implement a class called Grabber that behaves similarly to the previous implementation in most ways. Some changes to functionality were made.
     * Generated unit tests for the Grabber class. This resolves ticket #2.
---
 src/grab.py  | 98 ++++++++++++++++++++++++++++++++++--------------------------
 src/tests.py | 68 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+), 42 deletions(-)

diff --git a/src/grab.py b/src/grab.py
index 3d0310f..d9fd0c3 100644
--- a/src/grab.py
+++ b/src/grab.py
@@ -1,57 +1,71 @@
 import urllib
+import re
+from htmlentitydefs import name2codepoint
 from GGraph import *
 
-#id = 7401
+class Grabber:
+    """
+    Class for grabbing and parsing mathematician information from
+    Math Genealogy Database.
+    """
+    def __init__(self, id):
+        self.id = id
+        self.pagestr = None
+        self.name = None
+        self.institution = None
+        self.year = None
+        self.advisors = []
 
-def extractNodeInformation(id, graph):
-    search_list = [id]
+    def unescape(self, s):
+        return re.sub('&(%s);' % '|'.join(name2codepoint),\
+                      lambda m: unichr(name2codepoint[m.group(1)]), s)
 
-    while len(search_list) > 0:
-        id = search_list.pop()
-        #url = 'http://genealogy.math.ndsu.nodak.edu/html/id.phtml?id=' + str(id)
-        url = 'http://genealogy.math.ndsu.nodak.edu/id.php?id=' + str(id)
-        #url = 'http://www.genealogy.ams.org/html/id.phtml?id=' + str(id)
-        page = urllib.urlopen(url)
+    def getPage(self):
+        """
+        Grab the page for self.id from the Math Genealogy Database.
+        """
+        if self.pagestr is None:
+            url = 'http://genealogy.math.ndsu.nodak.edu/id.php?id=' + str(self.id)
+            page = urllib.urlopen(url)
+            self.pagestr = page.read()
+            self.pagestr = self.pagestr.decode('utf-8')
+            
+    def extractNodeInformation(self):
+        """
+        For the mathematician in this object, extract the list of
+        advisor ids, the mathematician name, the mathematician
+        institution, and the year of the mathematician's degree.
+        """
+        if self.pagestr is None:
+            self.getPage()
 
-        advisors = []
-        name = ''
-        institution = ''
-        year = -1
-
-        line = page.readline()
-        if line.find("<html>An error occurred in the forwarding block") > -1:
+        # Split the page string at newline characters.
+        psarray = self.pagestr.split('\n')
+        
+        if psarray[0].find("An error occurred in the forwarding block") > -1:
             # Then a bad URL was given. Throw an exception.
-            raise ValueError("Invalid address given: " + url)
-
+            msg = "Invalid page address for id %d" % (self.id)
+            raise ValueError(msg)
 
-        while line != '':
-            line = page.readline()
-            line = line.decode('utf-8')
+        lines = iter(psarray)
+        for line in lines:
             if line.find('h2 style=') > -1:
-            	line = page.readline()
-            	line = line.decode('utf-8')
-                name = line.split('</h2>')[0].strip()
+                line = lines.next()
+                self.name = self.unescape(line.split('</h2>')[0].strip())
 
-            if line.find('#006633; margin-left: 0.5em">') > -1:
+            if '#006633; margin-left: 0.5em">' in line:
                 inst_year = line.split('#006633; margin-left: 0.5em">')[1].split("</span>")[:2]
-                institution = inst_year[0].strip()
-                if inst_year[1].strip().isdigit():
-                    year = int(inst_year[1].strip())
+                self.institution = self.unescape(inst_year[0].strip())
+                if self.institution == u"":
+                    self.institution = None
+                if inst_year[1].split(',')[0].strip().isdigit():
+                    self.year = int(inst_year[1].split(',')[0].strip())
 
-            if line.find('Advisor') > -1:
-                if line.find('a href=\"id.php?id=') > -1:
+            if 'Advisor' in line:
+                if 'a href=\"id.php?id=' in line:
                     # Extract link to advisor page.
                     advisor_id = int(line.split('a href=\"id.php?id=')[1].split('\">')[0])
-                    advisors.append(advisor_id)
-                    if not graph.hasNode(advisor_id) and search_list.count(advisor_id) == 0:
-                        search_list.append(advisor_id)
-            elif line.find('Student(s)') > -1 or line.find('No students known') > -1:
-                break
+                    self.advisors.append(advisor_id)
 
-        #    print name.encode('iso-8859-1', 'replace')
-        #    print institution.encode('iso-8859-1', 'replace'), year
-        #    print advisors
-
-        if not graph.hasNode(id):
-            # Add node to graph.
-            graph.addNode(name, institution, year, id, advisors)
+            elif 'Student(s)' in line or 'No students known' in line:
+                break
diff --git a/src/tests.py b/src/tests.py
index 8b66e52..5fa5824 100644
--- a/src/tests.py
+++ b/src/tests.py
@@ -1,5 +1,6 @@
 import unittest
 import GGraph
+import grab
 
 # Unit tests for GGraph.
 class TestRecordMethods(unittest.TestCase):
@@ -261,5 +262,72 @@ class TestGraphMethods(unittest.TestCase):
         dotfile = graph.generateDotFile()
         self.assertEquals(dotfile, dotfileexpt)
 
+class TestGrabberMethods(unittest.TestCase):
+    """
+    Unit tests for the grab.Grabber class.
+    """
+    def setUp(self):
+        self.grabber = grab.Grabber(18231)
+        
+    def test001_init(self):
+        # Test constructor.
+        self.assertEquals(self.grabber.id, 18231)
+        self.assertEquals(self.grabber.pagestr, None)
+        self.assertEquals(self.grabber.name, None)
+        self.assertEquals(self.grabber.institution, None)
+        self.assertEquals(self.grabber.year, None)
+        self.assertEquals(self.grabber.advisors, [])
+
+    def test002_get_page(self):
+        # Test getPage() method.
+        self.grabber.getPage()
+        self.assert_(self.grabber.pagestr is not None)
+        self.assert_(u"<title>The Mathematics Genealogy Project - Carl Gau\xdf</title>" in self.grabber.pagestr)
+        # Get page again and test for adverse affects.
+        self.grabber.getPage()
+        self.assert_(u"<title>The Mathematics Genealogy Project - Carl Gau\xdf</title>" in self.grabber.pagestr)
+
+    def test003_extract_info_bad(self):
+        # Verify exception thrown for bad id.
+        grabber = grab.Grabber(999999999)
+        self.assertRaises(ValueError, grabber.extractNodeInformation)
+        
+    def test004_extract_info_all_fields(self):
+        # Test the extractNodeInformation() method for a record containing all fields.
+        self.grabber.extractNodeInformation()
+        self.assertEquals(self.grabber.name, u"Carl Friedrich Gau\xdf")
+        self.assertEquals(self.grabber.institution, u"Universit\xe4t Helmstedt")
+        self.assertEquals(self.grabber.year, 1799)
+        self.assertEquals(self.grabber.advisors, [18230])
+        
+    def test005_extract_info_no_advisor(self):
+        # Test the extractNodeInformation() method for a record with no advisor.
+        grabber = grab.Grabber(21235)
+        grabber.extractNodeInformation()
+        self.assertEquals(grabber.name, u"Otto  Mencke")
+        self.assertEquals(grabber.institution, u"Universit\xe4t Leipzig")
+        self.assertEquals(grabber.year, 1665)
+        self.assertEquals(grabber.advisors, [])
+        
+    def test006_extract_info_no_year(self):
+        # Test the extractNodeInformation() method for a record with no year.
+        grabber = grab.Grabber(53658)
+        grabber.extractNodeInformation()
+        self.assertEquals(grabber.name, u"S.  Cingolani")
+        self.assertEquals(grabber.institution, u"Universit\xe0 di Pisa")
+        self.assertEquals(grabber.year, None)
+        self.assertEquals(grabber.advisors, [51261])
+        
+    def test007_extract_info_no_inst(self):
+        # Test the extractNodeInformation() method for a record with no institution.
+        # This test is also missing additional information already tested.
+        grabber = grab.Grabber(52965)
+        grabber.extractNodeInformation()
+        self.assertEquals(grabber.name, u"Walter  Mayer")
+        self.assertEquals(grabber.institution, None)
+        self.assertEquals(grabber.year, None)
+        self.assertEquals(grabber.advisors, [])
+        
+
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/geneagrapher.git