[Debconf-video-commits] r335 - package/branches/pycon09/src
benh at alioth.debian.org
benh at alioth.debian.org
Thu Mar 19 01:03:31 UTC 2009
Author: benh
Date: 2009-03-19 01:03:31 +0000 (Thu, 19 Mar 2009)
New Revision: 335
Added:
package/branches/pycon09/src/pycon-import-events
Removed:
package/branches/pycon09/src/pycon-import-talks
Log:
Import tutorials too. By scraping the web site. Yuck.
Copied: package/branches/pycon09/src/pycon-import-events (from rev 334, package/branches/pycon09/src/pycon-import-talks)
===================================================================
--- package/branches/pycon09/src/pycon-import-events (rev 0)
+++ package/branches/pycon09/src/pycon-import-events 2009-03-19 01:03:31 UTC (rev 335)
@@ -0,0 +1,200 @@
+#!/usr/bin/python
+
+# Import events from public schedules
+
+import datetime
+import itertools
+import os
+import pgdb
+import pytz
+import re
+import sys
+import urllib2
+import urlparse
+import vobject
+
+TALKS_SOURCE_URL = 'http://us.pycon.org/2009/conference/schedule/ical/'
+TUTORIALS_SOURCE_URL = 'http://us.pycon.org/2009/tutorials/schedule/'
+
+_connection = None
+
+def get_cursor():
+ global _connection
+ if not _connection:
+ _connection = pgdb.connect(dsn=config['DATABASE_DSN'],
+ user=config.get('DATABASE_USER'),
+ password=config.get('DATABASE_PASSWORD'))
+ return _connection.cursor()
+
+def get_events_vobject(url):
+ data = urllib2.urlopen(url).read()
+ for node in vobject.readOne(data).getChildren():
+ if node.name == 'VEVENT':
+ yield dict((attr.name, attr.value) for attr in node.getChildren())
+
+def get_talks():
+ local_tz = pytz.timezone('America/Chicago')
+
+ for vevent in get_events_vobject(TALKS_SOURCE_URL):
+ event = {'conference_id': 1}
+
+ # Drop id number found in some summaries.
+ event['title'] = re.sub(r' \(#\d+\)$', '', vevent['SUMMARY'])
+
+ # Drop metadata at top of description, but hold onto the room name.
+ # Drop events which don't have this metadata, as they are such
+ # exciting talks as 'Lunch'.
+ match = re.match(r'Room: (?!None\n)([^\n]*)\n'
+ r'Presenters: [^\n]*\n'
+ r'\d+min [^\n]*\n'
+ r'(?:categories: [^\n]*\n)?'
+ r'(.*)',
+ vevent['DESCRIPTION'])
+ if not match:
+ print 'INFO: Ignoring', event['title']
+ continue
+ event['conference_room'] = match.group(1)
+ event['description'] = match.group(2)
+
+ event['source_url'] = TALKS_SOURCE_URL + '#' + vevent['UID']
+
+ event['start_time'] = vevent['DTSTART'].astimezone(local_tz).isoformat(' ')
+ event['duration'] = '%d SECONDS' % (vevent['DTEND'] - vevent['DTSTART']).seconds
+
+ yield event
+
+### HTML-to-text pasted from another project of mine - bwh
+
+_entity_re = re.compile(ur'&(#\d+|\w+);?')
+# TODO: fill this out
+_entity_map = {
+ u'amp': u'&',
+ u'gt': u'>',
+ u'lt': u'<',
+ u'nbsp': u'\u00A0',
+ u'pound': u'\u00A3',
+ u'quot': u'"'
+ }
+
+def _entity_replace(match):
+ name = match.group(1)
+ if name[0] == u'#':
+ code_point = int(name[1:])
+ if code_point < 128 or code_point >= 160:
+ return unichr(code_point)
+ else: # Assume broken Windows software.
+ return unicode(chr(code_point), 'windows-1252')
+ else:
+ return _entity_map.get(name, u'?')
+
+# An extremely liberal tag regex; it should catch all HTML start and
+# end tags, XML empty tags, and the bogus PIs found in some
+# MS-Orifice-generated HTML.
+_tag_re = re.compile(ur'<([/?]?[\w:]+)(?:\s[^>]*)?/?>')
+
+_tag_map = {
+ u'br': u'\n',
+ u'p': u'\n\n',
+ u'table': u'\n\n',
+ u'tr': u'\n'
+ }
+
+def _tag_replace(match):
+ return _tag_map.get(match.group(1).lower(), u'')
+
+_space_re = re.compile(ur'[\t\n\r ]+')
+_excess_vertical_space_re = re.compile(ur'(?:[\u00A0 ]*\n){3,}')
+
+def html_to_text(html, encoding):
+ '''Convert HTML in given encoding to plain Unicode text.'''
+ text = _entity_re.sub(_entity_replace,
+ _tag_re.sub(_tag_replace,
+ _space_re.sub(u' ',
+ unicode(html, encoding))))
+ return _excess_vertical_space_re.sub(u'\n\n', text).strip()
+
+def get_tutorials():
+ data = urllib2.urlopen(TUTORIALS_SOURCE_URL).read()
+
+ for match in re.finditer(r'<a class="reference external" href="([12][AP]M\d+)">', data):
+ source_id = match.group(1)
+
+ event = {
+ 'conference_id': 1,
+ 'source_url': urlparse.urljoin(TUTORIALS_SOURCE_URL, source_id)
+ }
+
+ event['start_time'] = '2009-03-%d' % (24 + int(source_id[:1]))
+ if source_id[1:3] == 'AM':
+ event['start_time'] = event['start_time'] + ' 09:00:00'
+ else:
+ event['start_time'] = event['start_time'] + ' 13:20:00'
+ event['duration'] = '200 MINUTES'
+ event['conference_room'] = 'Tutorial ' + source_id[3:] # FIXME
+
+ event_desc = urllib2.urlopen(event['source_url']).read()
+ match = re.search(r'<h1 class="title">(.*?)</h1>\s*'
+ r'(.*?)<div class="section"',
+ event_desc,
+ re.DOTALL)
+ if not match:
+ print 'WARN: Could not parse description on', event['source_url']
+ continue
+ event['title'] = html_to_text(match.group(1), 'utf-8')
+ event['description'] = html_to_text(match.group(2), 'utf-8')
+
+ yield event
+
+def main(pretend=False):
+ cur = get_cursor()
+
+ for event in itertools.chain(get_talks(), get_tutorials()):
+ # Insert room if necessary
+ cur.execute('SELECT COUNT(*) FROM conference_room'
+ ' WHERE conference_room=%(conference_room)s',
+ event)
+ if not pretend and not cur.fetchone()[0]:
+ cur.execute('INSERT INTO conference_room(conference_id, conference_room)'
+ ' VALUES (%(conference_id)d, %(conference_room)s)',
+ event)
+
+ # Insert/update event
+ cur.execute('SELECT event_id FROM event WHERE source_url=%(source_url)s',
+ event)
+ if pretend:
+ pass
+ row = cur.fetchone()
+ if row:
+ event['event_id'] = row[0]
+ cur.execute('UPDATE event SET'
+ ' source_url=%(source_url)s,'
+ ' conference_room=%(conference_room)s,'
+ ' title=%(title)s,'
+ ' description=%(description)s,'
+ ' start_time=%(start_time)s,'
+ ' duration=%(duration)s'
+ ' WHERE event_id=%(event_id)d',
+ event)
+ else:
+ cur.execute('INSERT INTO event(source_url, conference_id,'
+ ' conference_room, title, description,'
+ ' start_time, duration)'
+ ' VALUES(%(source_url)s, %(conference_id)d,'
+ ' %(conference_room)s, %(title)s,'
+ ' %(description)s, '
+ ' %(start_time)s,'
+ ' %(duration)s)',
+ event)
+
+ cur.execute('COMMIT')
+ cur.execute('BEGIN')
+
+if __name__ == '__main__':
+ try:
+ sys.path.insert(0, '/usr/share/debconf-video-store')
+ import shellconfig
+ config = shellconfig.read_file('/etc/default/debconf-video')
+ main(pretend=('--dry-run' in sys.argv[1:]))
+ except Exception, e:
+ print >>sys.stderr, 'ERROR:', e.__class__.__name__, e
+ sys.exit(2)
Property changes on: package/branches/pycon09/src/pycon-import-events
___________________________________________________________________
Name: svn:executable
+ *
Name: svn:mergeinfo
+
Deleted: package/branches/pycon09/src/pycon-import-talks
===================================================================
--- package/branches/pycon09/src/pycon-import-talks 2009-03-19 00:23:14 UTC (rev 334)
+++ package/branches/pycon09/src/pycon-import-talks 2009-03-19 01:03:31 UTC (rev 335)
@@ -1,114 +0,0 @@
-#!/usr/bin/python
-
-# Import talks from public schedule in ICS format
-
-import os
-import pgdb
-import pytz
-import re
-import sys
-import urllib2
-import vobject
-
-SOURCE_URL = 'http://us.pycon.org/2009/conference/schedule/ical/'
-
-_connection = None
-
-def get_cursor():
- global _connection
- if not _connection:
- _connection = pgdb.connect(dsn=config['DATABASE_DSN'],
- user=config.get('DATABASE_USER'),
- password=config.get('DATABASE_PASSWORD'))
- return _connection.cursor()
-
-def get_events_vobject(url):
- data = urllib2.urlopen(url).read()
- for node in vobject.readOne(data).getChildren():
- if node.name == 'VEVENT':
- yield dict((attr.name, attr.value) for attr in node.getChildren())
-
-def get_events():
- local_tz = pytz.timezone('America/Chicago')
-
- for vevent in get_events_vobject(SOURCE_URL):
- event = {'conference_id': 1}
-
- # Drop id number found in some summaries.
- event['title'] = re.sub(r' \(#\d+\)$', '', vevent['SUMMARY'])
-
- # Drop metadata at top of description, but hold onto the room name.
- # Drop events which don't have this metadata, as they are such
- # exciting talks as 'Lunch'.
- match = re.match(r'Room: (?!None\n)([^\n]*)\n'
- r'Presenters: [^\n]*\n'
- r'\d+min [^\n]*\n'
- r'(?:categories: [^\n]*\n)?'
- r'(.*)',
- vevent['DESCRIPTION'])
- if not match:
- print 'INFO: Ignoring', event['title']
- continue
- event['conference_room'] = match.group(1)
- event['description'] = match.group(2)
-
- event['source_url'] = SOURCE_URL + '#' + vevent['UID']
-
- event['start_time'] = vevent['DTSTART'].astimezone(local_tz).isoformat(' ')
- event['duration'] = '%d SECONDS' % (vevent['DTEND'] - vevent['DTSTART']).seconds
-
- yield event
-
-def main(pretend=False):
- cur = get_cursor()
-
- for event in get_events():
- # Insert room if necessary
- cur.execute('SELECT COUNT(*) FROM conference_room'
- ' WHERE conference_room=%(conference_room)s',
- event)
- if not pretend and not cur.fetchone()[0]:
- cur.execute('INSERT INTO conference_room(conference_id, conference_room)'
- ' VALUES (%(conference_id)d, %(conference_room)s)',
- event)
-
- # Insert/update event
- cur.execute('SELECT event_id FROM event WHERE source_url=%(source_url)s',
- event)
- if pretend:
- pass
- row = cur.fetchone()
- if row:
- event['event_id'] = row[0]
- cur.execute('UPDATE event SET'
- ' source_url=%(source_url)s,'
- ' conference_room=%(conference_room)s,'
- ' title=%(title)s,'
- ' description=%(description)s,'
- ' start_time=%(start_time)s,'
- ' duration=%(duration)s'
- ' WHERE event_id=%(event_id)d',
- event)
- else:
- cur.execute('INSERT INTO event(source_url, conference_id,'
- ' conference_room, title, description,'
- ' start_time, duration)'
- ' VALUES(%(source_url)s, %(conference_id)d,'
- ' %(conference_room)s, %(title)s,'
- ' %(description)s, '
- ' %(start_time)s,'
- ' %(duration)s)',
- event)
-
- cur.execute('COMMIT')
- cur.execute('BEGIN')
-
-if __name__ == '__main__':
- try:
- sys.path.insert(0, '/usr/share/debconf-video-store')
- import shellconfig
- config = shellconfig.read_file('/etc/default/debconf-video')
- main(pretend=('--dry-run' in sys.argv[1:]))
- except Exception, e:
- print >>sys.stderr, 'ERROR:', e.__class__.__name__, e
- sys.exit(2)
More information about the Debconf-video-commits
mailing list