[Debconf-video-commits] r335 - package/branches/pycon09/src

Thu Mar 19 01:03:31 UTC 2009

Author: benh
Date: 2009-03-19 01:03:31 +0000 (Thu, 19 Mar 2009)
New Revision: 335

Added:
   package/branches/pycon09/src/pycon-import-events
Removed:
   package/branches/pycon09/src/pycon-import-talks
Log:
Import tutorials too.  By scraping the web site.  Yuck.


Copied: package/branches/pycon09/src/pycon-import-events (from rev 334, package/branches/pycon09/src/pycon-import-talks)
===================================================================

--- package/branches/pycon09/src/pycon-import-events	                        (rev 0)
+++ package/branches/pycon09/src/pycon-import-events	2009-03-19 01:03:31 UTC (rev 335)
@@ -0,0 +1,200 @@
+#!/usr/bin/python
+
+# Import events from public schedules
+
+import datetime
+import itertools
+import os
+import pgdb
+import pytz
+import re
+import sys
+import urllib2
+import urlparse
+import vobject
+
+TALKS_SOURCE_URL = 'http://us.pycon.org/2009/conference/schedule/ical/'  # iCalendar feed read by get_talks()
+TUTORIALS_SOURCE_URL = 'http://us.pycon.org/2009/tutorials/schedule/'  # HTML page scraped by get_tutorials()
+
+_connection = None  # shared database connection, opened by the first get_cursor() call
+
+def get_cursor():
+    global _connection
+    if not _connection:
+        _connection = pgdb.connect(dsn=config['DATABASE_DSN'],
+                                   user=config.get('DATABASE_USER'),
+                                   password=config.get('DATABASE_PASSWORD'))
+    return _connection.cursor()
+
+def get_events_vobject(url):
+    data = urllib2.urlopen(url).read()
+    for node in vobject.readOne(data).getChildren():
+        if node.name == 'VEVENT':
+            yield dict((attr.name, attr.value) for attr in node.getChildren())
+
+def get_talks():
+    local_tz = pytz.timezone('America/Chicago')
+
+    for vevent in get_events_vobject(TALKS_SOURCE_URL):
+        event = {'conference_id': 1}
+
+        # Drop id number found in some summaries.
+        event['title'] = re.sub(r' \(#\d+\)$', '', vevent['SUMMARY'])
+
+        # Drop metadata at top of description, but hold onto the room name.
+        # Drop events which don't have this metadata, as they are such
+        # exciting talks as 'Lunch'.
+        match = re.match(r'Room: (?!None\n)([^\n]*)\n'
+                         r'Presenters: [^\n]*\n'
+                         r'\d+min [^\n]*\n'
+                         r'(?:categories: [^\n]*\n)?'
+                         r'(.*)',
+                         vevent['DESCRIPTION'])
+        if not match:
+            print 'INFO: Ignoring', event['title']
+            continue
+        event['conference_room'] = match.group(1)
+        event['description'] = match.group(2)
+
+        event['source_url'] = TALKS_SOURCE_URL + '#' + vevent['UID']
+
+        event['start_time'] = vevent['DTSTART'].astimezone(local_tz).isoformat(' ')
+        event['duration'] = '%d SECONDS' % (vevent['DTEND'] - vevent['DTSTART']).seconds
+
+        yield event
+
+### HTML-to-text pasted from another project of mine - bwh
+
+_entity_re = re.compile(ur'&(#\d+|\w+);?')  # decimal or named reference; the hex form '&#x..;' is not matched
+# TODO: fill this out
+_entity_map = {
+    u'amp': u'&',
+    u'gt': u'>',
+    u'lt': u'<',
+    u'nbsp': u'\u00A0',
+    u'pound': u'\u00A3',
+    u'quot': u'"'
+    }
+
+def _entity_replace(match):
+    name = match.group(1)
+    if name[0] == u'#':
+        code_point = int(name[1:])
+        if code_point < 128 or code_point >= 160:
+            return unichr(code_point)
+        else: # Assume broken Windows software.
+            return unicode(chr(code_point), 'windows-1252')
+    else:
+        return _entity_map.get(name, u'?')
+
+# An extremely liberal tag regex; it should catch all HTML start and
+# end tags, XML empty tags, and the bogus PIs found in some
+# MS-Orifice-generated HTML.
+_tag_re = re.compile(ur'<([/?]?[\w:]+)(?:\s[^>]*)?/?>')  # group 1: tag name with any leading '/' or '?'
+
+_tag_map = {  # replacement text for the tags that stand for line breaks
+    u'br': u'\n',
+    u'p': u'\n\n',
+    u'table': u'\n\n',
+    u'tr': u'\n'
+    }
+
+def _tag_replace(match):  # substitution callback for _tag_re
+    return _tag_map.get(match.group(1).lower(), u'')  # tags missing from _tag_map (e.g. '/p') become u''
+
+_space_re = re.compile(ur'[\t\n\r ]+')  # a run of tabs, newlines, carriage returns or spaces
+_excess_vertical_space_re = re.compile(ur'(?:[\u00A0 ]*\n){3,}')  # 3+ newlines separated only by spaces/NBSPs
+
+def html_to_text(html, encoding):
+    '''Convert HTML in given encoding to plain Unicode text.'''
+    text = _entity_re.sub(_entity_replace,
+                          _tag_re.sub(_tag_replace,
+                                      _space_re.sub(u' ',
+                                                    unicode(html, encoding))))
+    return _excess_vertical_space_re.sub(u'\n\n', text).strip()
+
+def get_tutorials():
+    data = urllib2.urlopen(TUTORIALS_SOURCE_URL).read()
+
+    for match in re.finditer(r'<a class="reference external" href="([12][AP]M\d+)">', data):
+        source_id = match.group(1)
+
+        event = {
+            'conference_id': 1,
+            'source_url': urlparse.urljoin(TUTORIALS_SOURCE_URL, source_id)
+            }
+
+        event['start_time'] = '2009-03-%d' % (24 + int(source_id[:1]))
+        if source_id[1:3] == 'AM':
+            event['start_time'] = event['start_time'] + ' 09:00:00'
+        else:
+            event['start_time'] = event['start_time'] + ' 13:20:00'
+        event['duration'] = '200 MINUTES'
+        event['conference_room'] = 'Tutorial ' + source_id[3:]  # FIXME
+
+        event_desc = urllib2.urlopen(event['source_url']).read()
+        match = re.search(r'<h1 class="title">(.*?)</h1>\s*'
+                          r'(.*?)<div class="section"',
+                          event_desc,
+                          re.DOTALL)
+        if not match:
+            print 'WARN: Could not parse description on', event['source_url']
+            continue
+        event['title'] = html_to_text(match.group(1), 'utf-8')
+        event['description'] = html_to_text(match.group(2), 'utf-8')
+
+        yield event
+
+def main(pretend=False):
+    cur = get_cursor()
+
+    for event in itertools.chain(get_talks(), get_tutorials()):
+        # Insert room if necessary
+        cur.execute('SELECT COUNT(*) FROM conference_room'
+                    ' WHERE conference_room=%(conference_room)s',
+                    event)
+        if not pretend and not cur.fetchone()[0]:
+            cur.execute('INSERT INTO conference_room(conference_id, conference_room)'
+                        ' VALUES (%(conference_id)d, %(conference_room)s)',
+                        event)
+
+        # Insert/update event
+        cur.execute('SELECT event_id FROM event WHERE source_url=%(source_url)s',
+                    event)
+        if pretend:
+            pass
+        row = cur.fetchone()
+        if row:
+            event['event_id'] = row[0]
+            cur.execute('UPDATE event SET'
+                        ' source_url=%(source_url)s,'
+                        ' conference_room=%(conference_room)s,'
+                        ' title=%(title)s,'
+                        ' description=%(description)s,'
+                        ' start_time=%(start_time)s,'
+                        ' duration=%(duration)s'
+                        ' WHERE event_id=%(event_id)d',
+                        event)
+        else:
+            cur.execute('INSERT INTO event(source_url, conference_id,'
+                        '                  conference_room, title, description,'
+                        '                  start_time, duration)'
+                        ' VALUES(%(source_url)s, %(conference_id)d,'
+                        '        %(conference_room)s, %(title)s,'
+                        '        %(description)s, '
+                        '        %(start_time)s,'
+                        '        %(duration)s)',
+                        event)
+
+        cur.execute('COMMIT')
+        cur.execute('BEGIN')
+
+if __name__ == '__main__':
+    try:
+        sys.path.insert(0, '/usr/share/debconf-video-store')  # presumably where shellconfig is installed
+        import shellconfig
+        config = shellconfig.read_file('/etc/default/debconf-video')  # module global read by get_cursor()
+        main(pretend=('--dry-run' in sys.argv[1:]))
+    except Exception, e:
+        print >>sys.stderr, 'ERROR:', e.__class__.__name__, e  # exception class and message only, no traceback
+        sys.exit(2)


Property changes on: package/branches/pycon09/src/pycon-import-events
___________________________________________________________________
Name: svn:executable
   + *
Name: svn:mergeinfo
   + 

Deleted: package/branches/pycon09/src/pycon-import-talks
===================================================================
--- package/branches/pycon09/src/pycon-import-talks	2009-03-19 00:23:14 UTC (rev 334)
+++ package/branches/pycon09/src/pycon-import-talks	2009-03-19 01:03:31 UTC (rev 335)
@@ -1,114 +0,0 @@
-#!/usr/bin/python
-
-# Import talks from public schedule in ICS format
-
-import os
-import pgdb
-import pytz
-import re
-import sys
-import urllib2
-import vobject
-
-SOURCE_URL = 'http://us.pycon.org/2009/conference/schedule/ical/'
-
-_connection = None
-
-def get_cursor():
-    global _connection
-    if not _connection:
-        _connection = pgdb.connect(dsn=config['DATABASE_DSN'],
-                                   user=config.get('DATABASE_USER'),
-                                   password=config.get('DATABASE_PASSWORD'))
-    return _connection.cursor()
-
-def get_events_vobject(url):
-    data = urllib2.urlopen(url).read()
-    for node in vobject.readOne(data).getChildren():
-        if node.name == 'VEVENT':
-            yield dict((attr.name, attr.value) for attr in node.getChildren())
-
-def get_events():
-    local_tz = pytz.timezone('America/Chicago')
-
-    for vevent in get_events_vobject(SOURCE_URL):
-        event = {'conference_id': 1}
-
-        # Drop id number found in some summaries.
-        event['title'] = re.sub(r' \(#\d+\)$', '', vevent['SUMMARY'])
-
-        # Drop metadata at top of description, but hold onto the room name.
-        # Drop events which don't have this metadata, as they are such
-        # exciting talks as 'Lunch'.
-        match = re.match(r'Room: (?!None\n)([^\n]*)\n'
-                         r'Presenters: [^\n]*\n'
-                         r'\d+min [^\n]*\n'
-                         r'(?:categories: [^\n]*\n)?'
-                         r'(.*)',
-                         vevent['DESCRIPTION'])
-        if not match:
-            print 'INFO: Ignoring', event['title']
-            continue
-        event['conference_room'] = match.group(1)
-        event['description'] = match.group(2)
-
-        event['source_url'] = SOURCE_URL + '#' + vevent['UID']
-
-        event['start_time'] = vevent['DTSTART'].astimezone(local_tz).isoformat(' ')
-        event['duration'] = '%d SECONDS' % (vevent['DTEND'] - vevent['DTSTART']).seconds
-
-        yield event
-
-def main(pretend=False):
-    cur = get_cursor()
-
-    for event in get_events():
-        # Insert room if necessary
-        cur.execute('SELECT COUNT(*) FROM conference_room'
-                    ' WHERE conference_room=%(conference_room)s',
-                    event)
-        if not pretend and not cur.fetchone()[0]:
-            cur.execute('INSERT INTO conference_room(conference_id, conference_room)'
-                        ' VALUES (%(conference_id)d, %(conference_room)s)',
-                        event)
-
-        # Insert/update event
-        cur.execute('SELECT event_id FROM event WHERE source_url=%(source_url)s',
-                    event)
-        if pretend:
-            pass
-        row = cur.fetchone()
-        if row:
-            event['event_id'] = row[0]
-            cur.execute('UPDATE event SET'
-                        ' source_url=%(source_url)s,'
-                        ' conference_room=%(conference_room)s,'
-                        ' title=%(title)s,'
-                        ' description=%(description)s,'
-                        ' start_time=%(start_time)s,'
-                        ' duration=%(duration)s'
-                        ' WHERE event_id=%(event_id)d',
-                        event)
-        else:
-            cur.execute('INSERT INTO event(source_url, conference_id,'
-                        '                  conference_room, title, description,'
-                        '                  start_time, duration)'
-                        ' VALUES(%(source_url)s, %(conference_id)d,'
-                        '        %(conference_room)s, %(title)s,'
-                        '        %(description)s, '
-                        '        %(start_time)s,'
-                        '        %(duration)s)',
-                        event)
-
-        cur.execute('COMMIT')
-        cur.execute('BEGIN')
-
-if __name__ == '__main__':
-    try:
-        sys.path.insert(0, '/usr/share/debconf-video-store')
-        import shellconfig
-        config = shellconfig.read_file('/etc/default/debconf-video')
-        main(pretend=('--dry-run' in sys.argv[1:]))
-    except Exception, e:
-        print >>sys.stderr, 'ERROR:', e.__class__.__name__, e
-        sys.exit(2)