[sagenb] 93/179: Add Pablo's changes responding to review. Various changes related to regex, loops, etc. for better parsing of sws html into rst.
felix salfelder
felix-guest at moszumanska.debian.org
Tue May 6 12:05:13 UTC 2014
This is an automated email from the git hooks/post-receive script.
felix-guest pushed a commit to branch master
in repository sagenb.
commit 9b1cf3f0460c94c60578e87893e7d5109fe1caea
Author: Karl Crisman <kcrisman at gmail.com>
Date: Thu Apr 11 22:09:39 2013 -0400
Add Pablo's changes responding to review
Various changes related to regex, loops, etc. for better
parsing of sws html into rst
---
sagenb/misc/comments2rst.py | 119 +++++++++++++++++++++++++++----------------
sagenb/misc/worksheet2rst.py | 14 +++--
2 files changed, 86 insertions(+), 47 deletions(-)
diff --git a/sagenb/misc/comments2rst.py b/sagenb/misc/comments2rst.py
index 77f812e..3db1545 100644
--- a/sagenb/misc/comments2rst.py
+++ b/sagenb/misc/comments2rst.py
@@ -33,6 +33,8 @@ or by using one of
in the Sage shell (sage --sh).
"""
+#negative lookbehind: http://www.regular-expressions.info/lookaround.html
+double_dollar = re.compile(r'(?<!\\)\$\$')
def preprocess_display_latex(text):
r"""replace $$some display latex$$ with <display>some display latex</display>
before the soup is built.
@@ -58,8 +60,8 @@ def preprocess_display_latex(text):
"""
ls = []
start_tag = True
- partes = text.split('$$')
- for c in partes[:-1]:
+ parts = double_dollar.split(text)
+ for c in parts[:-1]:
if start_tag:
ls.append(c)
ls.append('<display>')
@@ -74,7 +76,7 @@ def preprocess_display_latex(text):
elif abs(count)>1:
raise Exception, 'display latex was messed up with html code'
start_tag = not start_tag
- ls.append(partes[-1])
+ ls.append(parts[-1])
return ''.join(ls)
def prune_tags(text):
@@ -90,11 +92,25 @@ def escape_chars(text):
text = text.replace(c,r)
return text
+#This is supposed to be handled by BeautifulSoup, but doesn't work
+xml_entities = {'&lt;':'<',
+ '&gt;':'>',
+ '&amp;':'&',
+ '&quot;':'"',
+ '&apos;':"'",
+}
+def replace_xml_entities(text):
+ for c,r in xml_entities.iteritems():
+ text = text.replace(c,r)
+ return text
+
+
def replace_courier(soup):
"""Lacking a better option, I use courier font to mark <code>
within tinyMCE. And I want to turn that into real code tags.
- Most users won't be needing this(?)
+ Most users won't be needing this(?), so this code is not called anywhere
+ but kept for reference
"""
for t in soup.findAll(lambda s:s.has_key('style') and 'courier' in s['style']):
tag = Tag(soup, 'code')
@@ -102,9 +118,8 @@ def replace_courier(soup):
tag.append(t.contents[0])
t.replaceWith(tag)
-#inline_latex is careful not to confuse escaped dollars
-inline_latex = re.compile(r'([^\\])\$(.*?)([^\\])\$')
-latex_beginning = re.compile(r'\$(.*?)([^\\])\$')
+#negative lookbehind: http://www.regular-expressions.info/lookaround.html
+single_dollar = re.compile(r'(?<!\\)\$')
def replace_latex(soup):
r"""Replaces inline latex by :math:`code` and escapes
some rst special chars like +, -, * and | outside of inline latex
@@ -125,17 +140,14 @@ def replace_latex(soup):
<p><strong>2\+2 \| 1\+3</strong></p>
"""
for t in soup.findAll(text=re.compile('.+')):
- if latex_beginning.match(t):
- t.replaceWith(inline_latex.sub('\\1:math:`\\2\\3`',
- latex_beginning.sub(':math:`\\1\\2`',
- unicode(t),
- 1)))
- elif inline_latex.search(t):
- t.replaceWith(inline_latex.sub('\\1:math:`\\2\\3`',
- unicode(t)))
- elif not (t.fetchParents(name = 'display')
- or t.fetchParents(name = 'pre')):
- t.replaceWith(escape_chars(t))
+ if (t.fetchParents(name = 'display') or
+ t.fetchParents(name = 'pre') ):
+ continue
+ parts = single_dollar.split(unicode(t))
+ even = [escape_chars(parts[i]) for i in range(0,len(parts),2)]
+ odd = [' :math:`%s`'%parts[i] for i in range(1,len(parts),2)]
+ odd.append('')
+ t.replaceWith(''.join(''.join(p) for p in zip(even,odd) ))
class Soup2Rst(object):
"""builds the rst text from the Soup Tree
@@ -144,7 +156,7 @@ class Soup2Rst(object):
'h2':'header',
'h3':'header',
'h4':'header',
- 'p': 'inline_no_tag',
+ 'p': 'p',
'[document]': 'document',
'br': 'br',
'b':'strong',
@@ -170,14 +182,15 @@ class Soup2Rst(object):
headers = {'h1':u'=',
'h2':u'-',
- 'h3':u'~',
+ 'h3':u'^',
'h4':u'"',
+ 'h5':u'~',
}
def __init__(self, images_dir):
self.images_dir = images_dir
- self._nested_list = 0
- self._inside_ol = False
+ self._nested_list = -1
+ self._inside_ol_or_ul = []
self._inside_code_tag = False
def visit(self, node):
@@ -206,7 +219,7 @@ class Soup2Rst(object):
return t.replace('\n','')
def visit_header(self, node):
- s = ' '.join(self.visit(tag) for tag in node.contents)
+ s = ''.join(self.visit(tag) for tag in node.contents)
spacer = self.headers[node.name]*len(s)
return s.replace( '\n', '') + '\n' + spacer
@@ -215,27 +228,30 @@ class Soup2Rst(object):
def visit_ul(self, node):
self._nested_list += 1
- result = '\n'.join(self.visit(tag) for tag in node.contents)
+ self._inside_ol_or_ul.append(False)
+ result = '\n\n'+''.join(self.visit(tag) for tag in node.contents)+'\n'
+ self._inside_ol_or_ul.pop()
self._nested_list -= 1
return result
def visit_ol(self, node):
self._nested_list += 1
- self._inside_ol = True
- result = '\n'.join(self.visit(tag) for tag in node.contents)
+ self._inside_ol_or_ul.append(True)
+ result = '\n\n'+''.join(self.visit(tag) for tag in node.contents)+'\n'
+ self._inside_ol_or_ul.pop()
self._nested_list -= 1
- self._inside_ol = False
return result
def visit_li(self, node):
return (' '*self._nested_list
- + ('#. ' if self._inside_ol else '- ')
- +' '.join(self.visit(tag) for tag in node.contents))
+ + ('#. ' if self._inside_ol_or_ul[-1] else '- ')
+ +' '.join(self.visit(tag) for tag in node.contents)
+ + '\n')
def visit_display(self, node):
- return ('\n.. MATH::\n\n ' +
+ return ('\n\n.. MATH::\n\n ' +
unicode(node)[9:-10].replace('<br></br>','\n').replace('\n','\n ') +
- '\n\n')
+ '\n\n.. end of math\n\n')
def visit_img(self, node):
return '.. image:: ' + os.path.join(self.images_dir, node['src'].replace(' ','_')) + '\n :align: center\n'
@@ -251,7 +267,7 @@ class Soup2Rst(object):
if hasattr(row,'name') and
row.name=='tr')
rows.append([]) #this row represents a separator
- elif elt.name == 'tbody':
+ elif (elt.name == 'tbody') or (elt.name == 'tfoot'):
rows.extend(self.prepare_tr(row)
for row in elt
if hasattr(row,'name') and
@@ -285,7 +301,7 @@ class Soup2Rst(object):
def visit_strong(self, node):
if node.contents:
content = ' '.join(self.visit(tag) for tag in node.contents).strip()
- if '``' in content or self._inside_code_tag:
+ if '``' in content:
return content
else:
return '**' + content + '**'
@@ -294,30 +310,38 @@ class Soup2Rst(object):
def visit_em(self,node):
if node.contents:
- return '*' + ' '.join(self.visit(tag) for tag in node.contents).strip() + '*'
+ return ' *' + ' '.join(self.visit(tag) for tag in node.contents).strip() + '* '
else:
return ''
def visit_code(self, node):
if node.contents:
- self._inside_code_tag = True
content = self.get_plain_text(node).strip()
- self._inside_code_tag = False
return '``' + content + '``'
else:
return ''
def visit_inline_no_tag(self, node):
return (' '.join(self.visit(tag)
- for tag in node.contents)).strip() + '\n'
+ for tag in node.contents)).strip()
def visit_block_no_tag(self, node):
- return '\n'.join(self.visit(tag) for tag in node.contents)
+ return '\n'.join(self.visit(tag) for tag in node.contents) + '\n'
+
+ def visit_p(self, node):
+ return ''.join(self.visit(tag) for tag in node.contents) + '\n\n'
def visit_a(self, node):
- return ('`' + ' '.join(self.visit(tag) for tag in node.contents) +
- ' <' + node['href'] + '>`_'
- )
+ c = ' '.join(self.visit(tag) for tag in node.contents)
+ try:
+ link = node['href']
+ if link[0]=='#':
+ return ':ref:`%s <%s>`'%(c, link[1:])
+ else:
+ return '`%s <%s>`_'%(c, link)
+ except KeyError:
+ return '.. _%s:\n\n'%node['name']
+
def html2rst(text, images_dir):
"""Converts html, tipically generated by tinyMCE, into rst
@@ -364,14 +388,21 @@ def html2rst(text, images_dir):
#ICantBelieveItsBeautifulSoup is better than BeautifulSoup
#for html that wasn't generated by humans (like tinyMCE)
soup = ICantBelieveItsBeautifulSoup(text,
- convertEntities=ICantBelieveItsBeautifulSoup.HTML_ENTITIES)
+ convertEntities=ICantBelieveItsBeautifulSoup.ALL_ENTITIES)
#remove all comments
comments = soup.findAll(text=lambda text:isinstance(text, Comment))
for comment in comments:
comment.extract()
- replace_courier(soup)
+# replace_courier(soup)
replace_latex(soup)
v = Soup2Rst(images_dir)
- return v.visit(soup)
+
+# return v.visit(soup)
+ text = v.visit(soup)
+ more_than_2_blank_lines = re.compile(r'\n\n+', re.MULTILINE)
+ text = more_than_2_blank_lines.sub('\n\n', text)
+ text = replace_xml_entities(text)
+ return text
+
diff --git a/sagenb/misc/worksheet2rst.py b/sagenb/misc/worksheet2rst.py
index f12fe3e..4f1de9d 100644
--- a/sagenb/misc/worksheet2rst.py
+++ b/sagenb/misc/worksheet2rst.py
@@ -116,6 +116,13 @@ def code_parser(text):
lines.append(prefix + l)
return '\n'.join(lines)
+HEADER_RE = re.compile(r'<h\d>')
+def add_title_if_there_is_none(text):
+ if not HEADER_RE.search(text):
+ return '<h1>Please write a title for this worksheet!</h1>\n' + text
+ else:
+ return text
+
def worksheet2rst(s, images_dir=''):
"""Parses a string, tipically the content of the file
worksheet.html inside a sws file, and converts it into
@@ -142,7 +149,7 @@ def worksheet2rst(s, images_dir=''):
: worksheet2rst(s)
u'.. -*- coding: utf-8 -*-\n\n\n::\n\n sage: show(f)\n\n.. MATH::\n\n \\sqrt{x}\n\n.. end of output\n'
"""
- result_parser = results2rst
+ s = add_title_if_there_is_none(s)
state = States.COMMENT
result = ['.. -*- coding: utf-8 -*-\n']
ls = []
@@ -157,7 +164,7 @@ def worksheet2rst(s, images_dir=''):
result.append(html2rst(u'\n'.join(ls), img_path))
elif state == States.RESULT:
img_path = os.path.join(images_dir, 'cell_%s_'%last_cell_id)
- result.append(result_parser(u'\n'.join(ls),
+ result.append(results2rst(u'\n'.join(ls),
img_path))
result.append('')
result.append('.. end of output')
@@ -191,6 +198,7 @@ if __name__=='__main__':
fichero.close()
else:
text = sys.stdin.read()
+ images_dir = sys.argv[2] if len(sys.argv)>2 else ''
- print worksheet2rst(text).encode('utf-8')
+ print worksheet2rst(text, images_dir).encode('utf-8')
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/sagenb.git
More information about the debian-science-commits
mailing list