[sagenb] 28/179: Sanitize the output and text cells of published worksheets.

felix salfelder felix-guest at moszumanska.debian.org
Tue May 6 12:05:07 UTC 2014


This is an automated email from the git hooks/post-receive script.

felix-guest pushed a commit to branch master
in repository sagenb.

commit 77fb49a0f585f01b5bc4bc26dfca7e64499aef78
Author: Jason Grout <jason.grout at drake.edu>
Date:   Thu Oct 4 16:00:50 2012 -0500

    Sanitize the output and text cells of published worksheets.
---
 sagenb/data/sage/html/notebook/cell.html      |  8 ++--
 sagenb/data/sage/html/notebook/text_cell.html |  2 +-
 sagenb/notebook/cell.py                       | 53 ++++++++++++++++++++++-----
 3 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/sagenb/data/sage/html/notebook/cell.html b/sagenb/data/sage/html/notebook/cell.html
index 0027932..a2fcdbd 100644
--- a/sagenb/data/sage/html/notebook/cell.html
+++ b/sagenb/data/sage/html/notebook/cell.html
@@ -96,20 +96,20 @@ INPUT:
                             <div class="cell_output_{{ "print_" if do_print else '' }}{{ cell.cell_output_type() }}"
                                 id="cell_output_{{ cell.id() }}">
                                 {% if cell.introspect() %}
-                                    {{ cell.output_text(0, html=true) }}
+                                    {{ cell.output_text(0, html=true, sanitize=publish) }}
                                 {% else %}
-                                    {{ cell.output_text(wrap_, html=true) }}
+                                    {{ cell.output_text(wrap_, html=true, sanitize=publish) }}
                                 {% endif %}
                             </div>
                             {% if not do_print %}
                                 <div class="cell_output_{{ 'print_' if do_print else '' }}nowrap_{{ cell.cell_output_type() }}"
                                      id="cell_output_nowrap_{{ cell.id() }}">
-                                    {{ cell.output_text(0, html=true) }}
+                                    {{ cell.output_text(0, html=true, sanitize=publish) }}
                                 </div>
                             {% endif %}
                                 <div class="cell_output_html_{{ cell.cell_output_type() }}"
                                      id="cell_output_html_{{ cell.id() }}">
-                                    {{ cell.output_html() }}
+                                    {{ cell.output_html(sanitize=publish) }}
                                 </div>
                         </div>
                     </td>
diff --git a/sagenb/data/sage/html/notebook/text_cell.html b/sagenb/data/sage/html/notebook/text_cell.html
index dffc6bc..8e00d76 100644
--- a/sagenb/data/sage/html/notebook/text_cell.html
+++ b/sagenb/data/sage/html/notebook/text_cell.html
@@ -25,7 +25,7 @@ INPUT:
     </script>
     {% endif %}
     <div class="text_cell" id="cell_text_{{ cell.id() }}">
-      {{ cell.plain_text() }}
+      {{ cell.plain_text(sanitize=publish) }}
     </div>
 {% if JEDITABLE_TINYMCE and not cell.worksheet().is_published() and not cell.worksheet().docbrowser() and not do_print and not publish %}
     <script type="text/javascript">
diff --git a/sagenb/notebook/cell.py b/sagenb/notebook/cell.py
index 1580580..f15afa7 100644
--- a/sagenb/notebook/cell.py
+++ b/sagenb/notebook/cell.py
@@ -47,6 +47,25 @@ re_script = re.compile(r'<script[^>]*?>.*?</script>', re.DOTALL | re.I)
 # Whether to enable editing of :class:`TextCell`s with TinyMCE.
 JEDITABLE_TINYMCE = True
 
+try:
+    from lxml.html.clean import Cleaner
+    from lxml.etree import XMLSyntaxError
+    class SageCleaner(Cleaner):
+        def allow_element(self, el):
+            # Added this one test for mathjax <script> tags
+            if el.tag=='script' and el.get('type')=='math/tex' and not el.get('src'):
+                return True
+            return super(SageCleaner, self).allow_element(el)
+    html_cleaner = SageCleaner(page_structure=False, remove_tags=('head', 'title'), style=True)
+    def clean_html(text):
+        try:
+            return html_cleaner.clean_html(text)
+        except XMLSyntaxError:
+            return ''
+except ImportError:
+    def clean_html(text):
+        # looks ugly, but gets the job done
+        return text.replace('<', '&lt;')
 
 ###########################
 # Generic (abstract) cell #
@@ -563,7 +582,7 @@ class TextCell(Cell_generic):
                         editing = editing, publish = publish)
 
 
-    def plain_text(self, prompts=False):
+    def plain_text(self, prompts=False, sanitize=False):
         ur"""
         Returns a plain text version of this text cell.
 
@@ -585,7 +604,10 @@ class TextCell(Cell_generic):
             sage: C.plain_text()
             u'\u011b\u0161\u010d\u0159\u017e\xfd\xe1\xed\xe9\u010f\u010e'
         """
-        return self._text
+        if sanitize:
+            return clean_html(self._text)
+        else:
+            return self._text
 
     def edit_text(self):
         """
@@ -1664,7 +1686,7 @@ class Cell(Cell_generic):
         except AttributeError:
             return None
 
-    def output_html(self):
+    def output_html(self, sanitize=False):
         """
         Returns this compute cell's HTML output.
 
@@ -1682,7 +1704,10 @@ class Cell(Cell_generic):
             u'<strong>5</strong>'
         """
         try:
-            return self._out_html
+            if sanitize:
+                return clean_html(self._out_html)
+            else:
+                return self._out_html
         except AttributeError:
             self._out_html = ''
             return ''
@@ -1716,7 +1741,7 @@ class Cell(Cell_generic):
             urls = urls.replace(s, begin + s[7:-1] + end)
         return urls
 
-    def output_text(self, ncols=0, html=True, raw=False, allow_interact=True):
+    def output_text(self, ncols=0, html=True, raw=False, allow_interact=True, sanitize=False):
         ur"""
         Returns this compute cell's output text.
 
@@ -1733,6 +1758,9 @@ class Cell(Cell_generic):
         - ``allow_interact`` - a boolean (default: True); whether to
           allow :func:`sagenb.notebook.interact.interact`\ ion
 
+        - ``sanitize`` - a boolean (default: False); whether to sanitize
+          the html (if html is selected)
+
         OUTPUT:
 
         - a string
@@ -1757,7 +1785,7 @@ class Cell(Cell_generic):
         """
         if allow_interact and hasattr(self, '_interact_output'):
             # Get the input template
-            z = self.output_text(ncols, html, raw, allow_interact=False)
+            z = self.output_text(ncols, html, raw, allow_interact=False, sanitize=sanitize)
             if not INTERACT_TEXT in z or not INTERACT_HTML in z:
                 return z
             if ncols:
@@ -1765,7 +1793,7 @@ class Cell(Cell_generic):
                 try:
                     # Fill in the output template
                     output, html = self._interact_output
-                    output = self.parse_html(output, ncols)
+                    output = self.parse_html(output, ncols, sanitize=sanitize)
                     z = z.replace(INTERACT_TEXT, output)
                     z = z.replace(INTERACT_HTML, html)
                     return z
@@ -1793,7 +1821,7 @@ class Cell(Cell_generic):
             return s
 
         if html:
-            s = self.parse_html(s, ncols)
+            s = self.parse_html(s, ncols, sanitize=sanitize)
 
         if (not is_interact and not self.is_html() and len(s.strip()) > 0 and
             '<div class="docstring">' not in s):
@@ -1801,7 +1829,7 @@ class Cell(Cell_generic):
 
         return s.strip('\n')
 
-    def parse_html(self, s, ncols):
+    def parse_html(self, s, ncols, sanitize=False):
         r"""
         Parses HTML for output, escaping and wrapping HTML and
         removing script elements.
@@ -1812,6 +1840,8 @@ class Cell(Cell_generic):
 
         - ``ncols`` - an integer; the number of word wrap columns
 
+        - ``sanitize`` - a boolean; sanitize the html
+
         OUTPUT:
 
         - a string
@@ -1829,7 +1859,10 @@ class Cell(Cell_generic):
             return word_wrap(escape(x), ncols)
 
         def format_html(x):
-            return self.process_cell_urls(x)
+            t = self.process_cell_urls(x)
+            if sanitize:
+                t = clean_html(t)
+            return t
 
         # If there is an error in the output, specially format it.
         if not self.is_interactive_cell():

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/sagenb.git



More information about the debian-science-commits mailing list