[geneagrapher] 194/226: Generalize grabbing interface.

Doug Torrance dtorrance-guest at moszumanska.debian.org
Sat Jul 11 17:11:08 UTC 2015


This is an automated email from the git hooks/post-receive script.

dtorrance-guest pushed a commit to branch master
in repository geneagrapher.

commit 8d000fd9a283855e12d0a8c54d2402663358612f
Author: David Alber <alber.david at gmail.com>
Date:   Fri Dec 23 15:23:26 2011 -0800

    Generalize grabbing interface.
    
    This change modifies the Geneagrapher class so that it allows a caller
    to provide a custom grabber class. The advantage of this is that it
    allows the introduction of more complex grabbers, such as grabbers that
    provide caching.
    
    To accomplish this, the Grabber class is being modified to work across
    grabbing sessions (previously a Grabber object was used for retrieving
    the record of a single mathematician), which is necessary for a
    cache-based grabber. Additionally, the Grabber class is now a context
    manager, which allows using it with the Python "with" statement.
    Although this provides no real advantage for Grabber, it will be
    advantageous for a grabber that opens a resource when initialized and
    closes the resource when it falls out of scope.
    
    This change closes #7.
---
 src/geneagrapher/geneagrapher.py           | 33 ++++++------
 src/geneagrapher/grabber.py                | 50 +++++++++--------
 tests/geneagrapher/test_grabber_methods.py | 86 ++++++++++++------------------
 3 files changed, 78 insertions(+), 91 deletions(-)

diff --git a/src/geneagrapher/geneagrapher.py b/src/geneagrapher/geneagrapher.py
index 8e698d5..c0d69b8 100644
--- a/src/geneagrapher/geneagrapher.py
+++ b/src/geneagrapher/geneagrapher.py
@@ -57,7 +57,7 @@ in graph")
         self.write_filename = options.filename
         self.seed_ids = [int(arg) for arg in args]
 
-    def build_graph_portion(self, grab_queue, is_seed, **kwargs):
+    def build_graph_portion(self, grab_queue, is_seed, grabber, **kwargs):
         """Handle grabbing and storing nodes in the graph. Depending on the
         arguments, this method handles seed nodes, ancestors, or
         descendants."""
@@ -65,11 +65,10 @@ in graph")
             id = grab_queue.popleft()
             if not self.graph.has_node(id):
                 # Then this information has not yet been grabbed.
-                grabber = Grabber(id)
                 if self.verbose:
                     print "Grabbing record #{}".format(id)
                 [name, institution, year, advisors,
-                 descendants] = grabber.extract_node_information()
+                 descendants] = grabber.get_record(id)
                 self.graph.add_node(name, institution, year, id, advisors,
                                     descendants, is_seed)
                 if self.get_ancestors and 'ancestor_queue' in kwargs:
@@ -77,7 +76,7 @@ in graph")
                 if self.get_descendants and 'descendant_queue' in kwargs:
                     kwargs['descendant_queue'].extend(descendants)
 
-    def build_graph(self):
+    def build_graph(self, record_grabber=Grabber, **kwargs):
         """
         Populate the graph member by grabbing the mathematician
         pages and extracting relevant data.
@@ -85,21 +84,21 @@ in graph")
         seed_queue = deque(self.seed_ids)
         ancestor_queue = deque()
         descendant_queue = deque()
+        with record_grabber(**kwargs) as grabber:
+            # Grab "seed" nodes.
+            self.build_graph_portion(seed_queue, True, grabber,
+                                     ancestor_queue=ancestor_queue,
+                                     descendant_queue=descendant_queue)
 
-        # Grab "seed" nodes.
-        self.build_graph_portion(seed_queue, True,
-                                 ancestor_queue=ancestor_queue,
-                                 descendant_queue=descendant_queue)
-
-        # Grab ancestors of seed nodes.
-        if self.get_ancestors:
-            self.build_graph_portion(ancestor_queue, False,
-                                     ancestor_queue=ancestor_queue)
+            # Grab ancestors of seed nodes.
+            if self.get_ancestors:
+                self.build_graph_portion(ancestor_queue, False, grabber,
+                                         ancestor_queue=ancestor_queue)
 
-        # Grab descendants of seed nodes.
-        if self.get_descendants:
-            self.build_graph_portion(descendant_queue, False,
-                                     descendant_queue=descendant_queue)
+            # Grab descendants of seed nodes.
+            if self.get_descendants:
+                self.build_graph_portion(descendant_queue, False, grabber,
+                                         descendant_queue=descendant_queue)
 
     def generate_dot_file(self):
         dotfile = self.graph.generate_dot_file(self.get_ancestors,
diff --git a/src/geneagrapher/grabber.py b/src/geneagrapher/grabber.py
index c055e61..fbaff33 100644
--- a/src/geneagrapher/grabber.py
+++ b/src/geneagrapher/grabber.py
@@ -8,26 +8,22 @@ class Grabber:
     Class for grabbing and parsing mathematician information from
     Math Genealogy Database.
     """
-    def __init__(self, id):
-        self.id = id
-        self.name = None
-        self.institution = None
-        self.year = None
-        self.advisors = set([])
-        self.descendants = set([])
+    def __init__(self, **kwargs):
+        pass
 
-    @staticmethod
-    def extract_id(tag):
-        """Extract the ID from a tag with form <a href="id.php?id=7401">."""
-        return int(tag.attrs[0][-1].split('=')[-1])
+    def __enter__(self):
+        return self
 
-    def extract_node_information(self):
+    def __exit__(self, exc_type, exc_value, traceback):
+        pass
+
+    def get_record(self, id):
         """
         For the mathematician in this object, extract the list of
         advisor ids, the mathematician name, the mathematician
         institution, and the year of the mathematician's degree.
         """
-        url = 'http://genealogy.math.ndsu.nodak.edu/id.php?id=' + str(self.id)
+        url = 'http://genealogy.math.ndsu.nodak.edu/id.php?id=' + str(id)
         page = urllib.urlopen(url)
         soup = BeautifulSoup(page, convertEntities='html')
         page.close()
@@ -36,33 +32,41 @@ class Grabber:
 exist in the database. Please back up and try again.":
             # Then a bad URL (e.g., a bad record id) was given. Throw an
             # exception.
-            msg = "Invalid id {}".format(self.id)
+            msg = "Invalid id {}".format(id)
             raise ValueError(msg)
 
         # Get mathematician name.
-        self.name = soup.find('h2').getText()
+        name = soup.find('h2').getText()
 
         # Get institution name (or None, if it there is no institution name).
-        self.institution = soup.find('div', style="line-height: 30px; \
+        institution = soup.find('div', style="line-height: 30px; \
 text-align: center; margin-bottom: 1ex").find('span').find('span').text
-        if self.institution == u'':
-            self.institution = None
+        if institution == u'':
+            institution = None
 
         # Get graduation year, if present.
         inst_year = soup.find('div', style="line-height: 30px; text-align: \
 center; margin-bottom: 1ex").find('span').contents[-1].strip()
         if inst_year.isdigit():
-            self.year = int(inst_year)
+            year = int(inst_year)
+        else:
+            year = None
 
         # Get advisor IDs.
-        self.advisors = set([self.extract_id(info.findNext()) for info in
+        advisors = set([extract_id(info.findNext()) for info in
                              soup.findAll(text=re.compile('Advisor'))
                              if 'Advisor: Unknown' not in info])
 
         # Get descendant IDs.
         if soup.find('table') is not None:
-            self.descendants = set([self.extract_id(info) for info in
+            descendants = set([extract_id(info) for info in
                                     soup.find('table').findAll('a')])
+        else:
+            descendants = set([])
+
+        return [name, institution, year, advisors, descendants]
+
 
-        return [self.name, self.institution, self.year, self.advisors,
-                self.descendants]
+def extract_id(tag):
+    """Extract the ID from a tag with form <a href="id.php?id=7401">."""
+    return int(tag.attrs[0][-1].split('=')[-1])
diff --git a/tests/geneagrapher/test_grabber_methods.py b/tests/geneagrapher/test_grabber_methods.py
index 48d0d6f..2c6a967 100644
--- a/tests/geneagrapher/test_grabber_methods.py
+++ b/tests/geneagrapher/test_grabber_methods.py
@@ -5,38 +5,28 @@ from geneagrapher.grabber import Grabber
 class TestGrabberMethods(unittest.TestCase):
     """Unit tests for the Grabber class."""
     def setUp(self):
-        self.grabber = Grabber(18231)
+        self.grabber = Grabber()
 
     def test_init(self):
         """Test constructor."""
-        self.assertEqual(self.grabber.id, 18231)
-        self.assertEqual(self.grabber.name, None)
-        self.assertEqual(self.grabber.institution, None)
-        self.assertEqual(self.grabber.year, None)
-        self.assertEqual(self.grabber.advisors, set([]))
-        self.assertEqual(self.grabber.descendants, set([]))
+        self.assertIsInstance(self.grabber, Grabber)
 
-    def test_extract_info_bad(self):
+    def test_get_record_bad(self):
         """Verify exception thrown for bad id."""
-        grabber = Grabber(999999999)
-        self.assertRaises(ValueError, grabber.extract_node_information)
+        grabber = Grabber()
+        self.assertRaises(ValueError, grabber.get_record, 999999999)
 
         try:
-            grabber.extract_node_information()
+            grabber.get_record(999999999)
         except ValueError as e:
             self.assertEqual(str(e), "Invalid id 999999999")
         else:
             self.fail()
 
-    def test_extract_info_all_fields(self):
-        """Test the extract_node_information() method for a record containing
-        all fields."""
+    def test_get_record_all_fields(self):
+        """Test the get_record() method for a record containing all fields."""
         [name, institution, year, advisors,
-         descendents] = self.grabber.extract_node_information()
-        self.assertEqual(name, self.grabber.name)
-        self.assertEqual(institution, self.grabber.institution)
-        self.assertEqual(year, self.grabber.year)
-        self.assertEqual(advisors, self.grabber.advisors)
+         descendents] = self.grabber.get_record(18231)
         self.assertEqual(name, u"Carl Friedrich Gau\xdf")
         self.assertEqual(institution, u"Universit\xe4t Helmstedt")
         self.assertEqual(year, 1799)
@@ -44,10 +34,9 @@ class TestGrabberMethods(unittest.TestCase):
         self.assertEqual(descendents, set([18603, 18233, 62547, 29642, 55175,
                                            29458, 19953, 18232, 151876]))
 
-        # Verify calling extract_node_information() twice does not have side
-        # effect.
+        # Verify calling get_record() twice does not have side effect.
         [name, institution, year, advisors,
-         descendents] = self.grabber.extract_node_information()
+         descendents] = self.grabber.get_record(18231)
         self.assertEqual(name, u"Carl Friedrich Gau\xdf")
         self.assertEqual(institution, u"Universit\xe4t Helmstedt")
         self.assertEqual(year, 1799)
@@ -55,52 +44,47 @@ class TestGrabberMethods(unittest.TestCase):
         self.assertEqual(descendents, set([18603, 18233, 62547, 29642, 55175,
                                            29458, 19953, 18232, 151876]))
 
-    def test_extract_info_no_advisor(self):
-        """Test the extract_node_information() method for a record with no
-        advisor."""
-        grabber = Grabber(137717)
+    def test_get_record_no_advisor(self):
+        """Test the get_record() method for a record with no advisor."""
+        grabber = Grabber()
         [name, institution, year, advisors,
-         descendents] = grabber.extract_node_information()
+         descendents] = grabber.get_record(137717)
         self.assertEqual(name, u"Valentin  Alberti")
         self.assertEqual(institution, u"Universit\xe4t Leipzig")
         self.assertEqual(year, 1678)
         self.assertEqual(advisors, set([]))
         self.assertEqual(descendents, set([127946]))
 
-    def test_extract_info_no_descendants(self):
-        """Test the extract_node_information() method for a record with no
-        descendants."""
-        # This is currently identical to the extract_info_no_year test.
-        grabber = Grabber(53658)
+    def test_get_record_no_descendants(self):
+        """Test the get_record() method for a record with no descendants."""
+        # This is currently identical to the get_record_no_year test.
+        grabber = Grabber()
         [name, institution, year, advisors,
-         descendents] = grabber.extract_node_information()
+         descendents] = grabber.get_record(53658)
         self.assertEqual(name, u"S.  Cingolani")
         self.assertEqual(institution, u"Scuola Normale Superiore di Pisa")
         self.assertEqual(year, None)
         self.assertEqual(advisors, set([51261]))
         self.assertEqual(descendents, set([]))
 
-    def test_extract_info_no_year(self):
-        """
-        Test the extract_node_information() method for a record with no year.
-        """
+    def test_get_record_no_year(self):
+        """Test the get_record() method for a record with no year."""
         # This example also has no descendents.
-        grabber = Grabber(53658)
+        grabber = Grabber()
         [name, institution, year, advisors,
-         descendents] = grabber.extract_node_information()
+         descendents] = grabber.get_record(53658)
         self.assertEqual(name, u"S.  Cingolani")
         self.assertEqual(institution, u"Scuola Normale Superiore di Pisa")
         self.assertEqual(year, None)
         self.assertEqual(advisors, set([51261]))
         self.assertEqual(descendents, set([]))
 
-    def test_extract_info_no_inst(self):
-        """Test the extract_node_information() method for a record with no
-        institution."""
+    def test_get_record_no_inst(self):
+        """Test the get_record() method for a record with no institution."""
         # This test is also missing additional information already tested.
-        grabber = Grabber(52965)
+        grabber = Grabber()
         [name, institution, year, advisors,
-         descendents] = grabber.extract_node_information()
+         descendents] = grabber.get_record(52965)
         self.assertEqual(name, u"Walter  Mayer")
         self.assertEqual(institution, None)
         self.assertEqual(year, None)
@@ -109,12 +93,12 @@ class TestGrabberMethods(unittest.TestCase):
 
     # Tests for special (from my point of view) characters:
     def test_slash_l(self):
-        """Test the extract_node_information() method for a record
-        # containing a slash l character. Example:
-        # http://www.genealogy.math.ndsu.nodak.edu/id.php?id=7383."""
-        grabber = Grabber(7383)
+        """Test the get_record() method for a record containing a slash l
+        character. Example:
+        http://www.genealogy.math.ndsu.nodak.edu/id.php?id=7383."""
+        grabber = Grabber()
         [name, institution, year, advisors,
-         descendents] = grabber.extract_node_information()
+         descendents] = grabber.get_record(7383)
         self.assertEqual(name, u"W\u0142adys\u0142aw Hugo Dyonizy Steinhaus")
         self.assertEqual(institution,
                          u"Georg-August-Universit\xe4t G\xf6ttingen")
@@ -126,9 +110,9 @@ class TestGrabberMethods(unittest.TestCase):
 
     def test_multiple_advisors(self):
         """Test for multiple advisors."""
-        grabber = Grabber(19964)
+        grabber = Grabber()
         [name, institution, year, advisors,
-         descendents] = grabber.extract_node_information()
+         descendents] = grabber.get_record(19964)
         self.assertEqual(name, u"Rudolf Otto Sigismund Lipschitz")
         self.assertEqual(institution, u"Universit\xe4t Berlin")
         self.assertEqual(year, 1853)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/geneagrapher.git



More information about the debian-science-commits mailing list