Merge pull request #1052 from gabyx/master

Changed regex parsing in embedhtml.py to XML parsing with lxml, regex…
2026-06-27 16:10:24 +08:00 · 2017-10-05 11:52:22 +01:00
parent 527a3e4643 27e58f8fb2
commit 375a9be7b1
5 changed files with 73 additions and 53 deletions
@@ -8,6 +8,10 @@ cache:
 environment:
  COVERALLS_REPO_TOKEN:
    secure: lFyaxdbvCvXKM+PjmN9FToU8DhsdS474RgaW/bNAu4IBnn7QbfZzDYrjKw33V6Oo
+  global:
+    # lxml cannot be built from source on Windows unless the libxml headers
+    # are available, so make pip install it from binary packages only.
+    PIP_ONLY_BINARY: lxml
  matrix:

    - TOXENV: 'py27-notebook'
@@ -39,13 +43,13 @@ environment:
      PYTHON_HOME: C:\Python34
      PYTHON_VERSION: '3.4'
      PYTHON_ARCH: '32'
-    
+
    - TOXENV: 'py34-notebook43'
      TOXPYTHON: C:\Python34\python.exe
      PYTHON_HOME: C:\Python34
      PYTHON_VERSION: '3.4'
      PYTHON_ARCH: '32'
-    
+
    - TOXENV: 'py34-notebook44'
      TOXPYTHON: C:\Python34\python.exe
      PYTHON_HOME: C:\Python34
@@ -33,6 +33,7 @@ requirements:
    - setuptools
    - tornado
    - traitlets >=4.1
+    - lxml >=3.8.0

 test:
  imports:
@@ -72,6 +72,7 @@ if you encounter any problems, and create a new issue if needed!
            'pyyaml',
            'tornado',
            'traitlets >=4.1',
+            'lxml >=3.8.0'
        ],
        extras_require={
            'test': [
@@ -2,8 +2,8 @@

 import base64
 import os
-import re

+import lxml.etree as et
 from ipython_genutils.ipstruct import Struct
 from nbconvert.exporters.html import HTMLExporter

@@ -25,15 +25,20 @@ class EmbedHTMLExporter(HTMLExporter):
        jupyter nbconvert --to html_embed mynotebook.ipynb
    """

-    def replfunc(self, match):
+    def replfunc(self, node):
        """Replace source url or file link with base64 encoded blob."""
-        url = match.group(1)
+        url = node.attrib["src"]
        imgformat = url.split('.')[-1]
+        b64_data = None
+        prefix = None
+
+        if url.startswith('data'):
+            return  # Already in base64 Format
+
+        self.log.info("try embedding url: %s, format: %s" % (url, imgformat))
+
        if url.startswith('http'):
-            data = urlopen(url).read()
-        elif url.startswith('data'):
-            img = '<img src="' + url + '"'
-            return img
+            b64_data = base64.b64encode(urlopen(url).read()).decode("utf-8")
        elif url.startswith('attachment'):
            imgname = url.split(':')[1]
            available_formats = self.attachments[imgname]
@@ -41,40 +46,47 @@ class EmbedHTMLExporter(HTMLExporter):
            for imgformat in self.config.NbConvertBase.display_data_priority:
                if imgformat in available_formats.keys():
                    b64_data = self.attachments[imgname][imgformat]
-                    img = '<img src="data:' + imgformat + \
-                          ';base64,' + b64_data + '"'
-                    return img
-            raise ValueError(
-                'Could not find attachment for image "%s" in notebook' %
-                imgname)
+                    prefix = "data:%s;base64," % imgformat
+            if b64_data is None:
+                raise ValueError("""Could not find attachment for image '%s'
+                                    in notebook""" % imgname)
        else:
            filename = os.path.join(self.path, url)
            with open(filename, 'rb') as f:
-                data = f.read()
+                b64_data = base64.b64encode(f.read()).decode("utf-8")

-        self.log.info("embedding url: %s, format: %s" % (url, imgformat))
-        b64_data = base64.b64encode(data).decode("utf-8")
-        if imgformat == "svg":
-            img = '<img src="data:image/svg+xml;base64,' + \
-                b64_data + '"'
-        elif imgformat == "pdf":
-            img = '<img src="data:application/pdf;base64,' + \
-                b64_data + '"'
-        else:
-            img = '<img src="data:image/' + imgformat + \
-                ';base64,' + b64_data + '"'
-        return img
+        if prefix is None:
+            if imgformat == "svg":
+                prefix = "data:image/svg+xml;base64,"
+            elif imgformat == "pdf":
+                prefix = "data:application/pdf;base64,"
+            else:
+                prefix = "data:image/" + imgformat + ';base64,'
+
+        node.attrib["src"] = prefix + b64_data

    def from_notebook_node(self, nb, resources=None, **kw):
        output, resources = super(
            EmbedHTMLExporter, self).from_notebook_node(nb, resources)

        self.path = resources['metadata']['path']
+
+        # Get attachments
        self.attachments = Struct()
        for cell in nb.cells:
            if 'attachments' in cell.keys():
                self.attachments += cell['attachments']
-        regex = re.compile('<img\s+src="([^"]+)"')

-        embedded_output = regex.sub(self.replfunc, output)
+        # Parse the HTML and replace the src of each <img> tag with embedded data
+        parser = et.HTMLParser()
+        root = et.fromstring(output, parser=parser)
+        nodes = root.findall(".//img")
+        for n in nodes:
+            self.replfunc(n)
+
+        # Convert back to HTML
+        embedded_output = et.tostring(root.getroottree(),
+                                      method="html",
+                                      encoding='unicode')
+
        return embedded_output, resources
@@ -5,6 +5,7 @@ import io
 import os
 from functools import wraps

+from lxml import etree as et
 from nbconvert.tests.base import TestsBase
 from nbformat import v4, write

@@ -24,32 +25,21 @@ def _with_tmp_cwd(func):

 class TestNbConvertExporters(TestsBase):

-    def check_stuff_gets_embedded(self, nb, exporter_name, to_be_included=[]):
+    def check_html(self, nb, exporter_name, check_func):
        nb_basename = 'notebook'
        nb_src_filename = nb_basename + '.ipynb'
        with io.open(nb_src_filename, 'w', encoding='utf-8') as f:
            write(nb, f, 4)

-        # convert with default exporter
-        self.nbconvert('--to {} "{}"'.format('html', nb_src_filename))
-        nb_dst_filename = nb_basename + '.html'
-        assert os.path.isfile(nb_dst_filename)
-        statinfo = os.stat(nb_dst_filename)
-
-        os.remove(nb_dst_filename)
-
        # convert with embedding exporter
+        nb_dst_filename = nb_basename + '.html'
        self.nbconvert('--to {} "{}"'.format(exporter_name, nb_src_filename))
-        statinfo_e = os.stat(nb_dst_filename)
-        assert os.path.isfile(nb_dst_filename)

-        assert statinfo_e.st_size > statinfo.st_size
-
-        with io.open(nb_dst_filename, 'r', encoding='utf-8') as f:
+        with open(nb_dst_filename, 'rb') as f:
            embedded_nb = f.read()
-
-        for txt in to_be_included:
-            assert txt in embedded_nb
+            parser = et.HTMLParser()
+            root = et.fromstring(embedded_nb, parser=parser)
+            check_func(byte_string=embedded_nb, root_node=root)

    @_with_tmp_cwd
    def test_embedhtml(self):
@@ -60,8 +50,14 @@ class TestNbConvertExporters(TestsBase):
                source="![testimage]({})".format(path_in_data('icon.png'))
            ),
        ])
-        self.check_stuff_gets_embedded(
-            nb, 'html_embed', to_be_included=['base64'])
+
+        def check(byte_string, root_node):
+            nodes = root_node.findall(".//img")
+            for n in nodes:
+                url = n.attrib["src"]
+                assert url.startswith('data')
+
+        self.check_html(nb, 'html_embed', check_func=check)

    @_with_tmp_cwd
    def test_htmltoc2(self):
@@ -70,8 +66,11 @@ class TestNbConvertExporters(TestsBase):
            v4.new_code_cell(source="a = 'world'"),
            v4.new_markdown_cell(source="# Heading"),
        ])
-        self.check_stuff_gets_embedded(
-            nb, 'html_toc', to_be_included=['toc2'])
+
+        def check(byte_string, root_node):
+            assert b'toc2' in byte_string
+
+        self.check_html(nb, 'html_toc', check_func=check)

    @_with_tmp_cwd
    def test_html_collapsible_headings(self):
@@ -84,5 +83,8 @@ class TestNbConvertExporters(TestsBase):
            v4.new_markdown_cell(source=('### level 3 heading')),
            v4.new_code_cell(source='a = range(1,10)'),
        ])
-        self.check_stuff_gets_embedded(
-            nb, 'html_ch', to_be_included=['collapsible_headings'])
+
+        def check(byte_string, root_node):
+            assert b'collapsible_headings' in byte_string
+
+        self.check_html(nb, 'html_ch', check_func=check)