[geneagrapher] 06/226: * Modified grab.py to implement a class called Grabber that behaves similarly to previous implementation, in most ways. Some changes to functionality were made. * Generated unit tests for the Grabber class. This resolves ticket #2.
Doug Torrance
dtorrance-guest at moszumanska.debian.org
Sat Jul 11 17:10:30 UTC 2015
This is an automated email from the git hooks/post-receive script.
dtorrance-guest pushed a commit to branch master
in repository geneagrapher.
commit e0f5eab2cd2ec45e2332472e753500a6284b23be
Author: David Alber <alber.david at gmail.com>
Date: Wed Apr 9 06:07:18 2008 +0000
* Modified grab.py to implement a class called Grabber that behaves similarly to the previous implementation in most ways. Some changes to functionality were made.
* Generated unit tests for the Grabber class. This resolves ticket #2.
---
src/grab.py | 98 ++++++++++++++++++++++++++++++++++--------------------------
src/tests.py | 68 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 124 insertions(+), 42 deletions(-)
diff --git a/src/grab.py b/src/grab.py
index 3d0310f..d9fd0c3 100644
--- a/src/grab.py
+++ b/src/grab.py
@@ -1,57 +1,71 @@
import urllib
+import re
+from htmlentitydefs import name2codepoint
from GGraph import *
-#id = 7401
+class Grabber:
+ """
+ Class for grabbing and parsing mathematician information from
+ Math Genealogy Database.
+ """
+ def __init__(self, id):
+ self.id = id
+ self.pagestr = None
+ self.name = None
+ self.institution = None
+ self.year = None
+ self.advisors = []
-def extractNodeInformation(id, graph):
- search_list = [id]
+ def unescape(self, s):
+ return re.sub('&(%s);' % '|'.join(name2codepoint),\
+ lambda m: unichr(name2codepoint[m.group(1)]), s)
- while len(search_list) > 0:
- id = search_list.pop()
- #url = 'http://genealogy.math.ndsu.nodak.edu/html/id.phtml?id=' + str(id)
- url = 'http://genealogy.math.ndsu.nodak.edu/id.php?id=' + str(id)
- #url = 'http://www.genealogy.ams.org/html/id.phtml?id=' + str(id)
- page = urllib.urlopen(url)
+ def getPage(self):
+ """
+ Grab the page for self.id from the Math Genealogy Database.
+ """
+ if self.pagestr is None:
+ url = 'http://genealogy.math.ndsu.nodak.edu/id.php?id=' + str(self.id)
+ page = urllib.urlopen(url)
+ self.pagestr = page.read()
+ self.pagestr = self.pagestr.decode('utf-8')
+
+ def extractNodeInformation(self):
+ """
+ For the mathematician in this object, extract the list of
+ advisor ids, the mathematician name, the mathematician
+ institution, and the year of the mathematician's degree.
+ """
+ if self.pagestr is None:
+ self.getPage()
- advisors = []
- name = ''
- institution = ''
- year = -1
-
- line = page.readline()
- if line.find("<html>An error occurred in the forwarding block") > -1:
+ # Split the page string at newline characters.
+ psarray = self.pagestr.split('\n')
+
+ if psarray[0].find("An error occurred in the forwarding block") > -1:
# Then a bad URL was given. Throw an exception.
- raise ValueError("Invalid address given: " + url)
-
+ msg = "Invalid page address for id %d" % (self.id)
+ raise ValueError(msg)
- while line != '':
- line = page.readline()
- line = line.decode('utf-8')
+ lines = iter(psarray)
+ for line in lines:
if line.find('h2 style=') > -1:
- line = page.readline()
- line = line.decode('utf-8')
- name = line.split('</h2>')[0].strip()
+ line = lines.next()
+ self.name = self.unescape(line.split('</h2>')[0].strip())
- if line.find('#006633; margin-left: 0.5em">') > -1:
+ if '#006633; margin-left: 0.5em">' in line:
inst_year = line.split('#006633; margin-left: 0.5em">')[1].split("</span>")[:2]
- institution = inst_year[0].strip()
- if inst_year[1].strip().isdigit():
- year = int(inst_year[1].strip())
+ self.institution = self.unescape(inst_year[0].strip())
+ if self.institution == u"":
+ self.institution = None
+ if inst_year[1].split(',')[0].strip().isdigit():
+ self.year = int(inst_year[1].split(',')[0].strip())
- if line.find('Advisor') > -1:
- if line.find('a href=\"id.php?id=') > -1:
+ if 'Advisor' in line:
+ if 'a href=\"id.php?id=' in line:
# Extract link to advisor page.
advisor_id = int(line.split('a href=\"id.php?id=')[1].split('\">')[0])
- advisors.append(advisor_id)
- if not graph.hasNode(advisor_id) and search_list.count(advisor_id) == 0:
- search_list.append(advisor_id)
- elif line.find('Student(s)') > -1 or line.find('No students known') > -1:
- break
+ self.advisors.append(advisor_id)
- # print name.encode('iso-8859-1', 'replace')
- # print institution.encode('iso-8859-1', 'replace'), year
- # print advisors
-
- if not graph.hasNode(id):
- # Add node to graph.
- graph.addNode(name, institution, year, id, advisors)
+ elif 'Student(s)' in line or 'No students known' in line:
+ break
diff --git a/src/tests.py b/src/tests.py
index 8b66e52..5fa5824 100644
--- a/src/tests.py
+++ b/src/tests.py
@@ -1,5 +1,6 @@
import unittest
import GGraph
+import grab
# Unit tests for GGraph.
class TestRecordMethods(unittest.TestCase):
@@ -261,5 +262,72 @@ class TestGraphMethods(unittest.TestCase):
dotfile = graph.generateDotFile()
self.assertEquals(dotfile, dotfileexpt)
+class TestGrabberMethods(unittest.TestCase):
+ """
+ Unit tests for the grab.Grabber class.
+ """
+ def setUp(self):
+ self.grabber = grab.Grabber(18231)
+
+ def test001_init(self):
+ # Test constructor.
+ self.assertEquals(self.grabber.id, 18231)
+ self.assertEquals(self.grabber.pagestr, None)
+ self.assertEquals(self.grabber.name, None)
+ self.assertEquals(self.grabber.institution, None)
+ self.assertEquals(self.grabber.year, None)
+ self.assertEquals(self.grabber.advisors, [])
+
+ def test002_get_page(self):
+ # Test getPage() method.
+ self.grabber.getPage()
+ self.assert_(self.grabber.pagestr is not None)
+ self.assert_(u"<title>The Mathematics Genealogy Project - Carl Gau\xdf</title>" in self.grabber.pagestr)
+ # Get page again and test for adverse effects.
+ self.grabber.getPage()
+ self.assert_(u"<title>The Mathematics Genealogy Project - Carl Gau\xdf</title>" in self.grabber.pagestr)
+
+ def test003_extract_info_bad(self):
+ # Verify exception thrown for bad id.
+ grabber = grab.Grabber(999999999)
+ self.assertRaises(ValueError, grabber.extractNodeInformation)
+
+ def test004_extract_info_all_fields(self):
+ # Test the extractNodeInformation() method for a record containing all fields.
+ self.grabber.extractNodeInformation()
+ self.assertEquals(self.grabber.name, u"Carl Friedrich Gau\xdf")
+ self.assertEquals(self.grabber.institution, u"Universit\xe4t Helmstedt")
+ self.assertEquals(self.grabber.year, 1799)
+ self.assertEquals(self.grabber.advisors, [18230])
+
+ def test005_extract_info_no_advisor(self):
+ # Test the extractNodeInformation() method for a record with no advisor.
+ grabber = grab.Grabber(21235)
+ grabber.extractNodeInformation()
+ self.assertEquals(grabber.name, u"Otto Mencke")
+ self.assertEquals(grabber.institution, u"Universit\xe4t Leipzig")
+ self.assertEquals(grabber.year, 1665)
+ self.assertEquals(grabber.advisors, [])
+
+ def test006_extract_info_no_year(self):
+ # Test the extractNodeInformation() method for a record with no year.
+ grabber = grab.Grabber(53658)
+ grabber.extractNodeInformation()
+ self.assertEquals(grabber.name, u"S. Cingolani")
+ self.assertEquals(grabber.institution, u"Universit\xe0 di Pisa")
+ self.assertEquals(grabber.year, None)
+ self.assertEquals(grabber.advisors, [51261])
+
+ def test007_extract_info_no_inst(self):
+ # Test the extractNodeInformation() method for a record with no institution.
+ # This test is also missing additional information already tested.
+ grabber = grab.Grabber(52965)
+ grabber.extractNodeInformation()
+ self.assertEquals(grabber.name, u"Walter Mayer")
+ self.assertEquals(grabber.institution, None)
+ self.assertEquals(grabber.year, None)
+ self.assertEquals(grabber.advisors, [])
+
+
if __name__ == '__main__':
unittest.main()
\ No newline at end of file
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/geneagrapher.git
More information about the debian-science-commits
mailing list