Merge pull request #1052 from gabyx/master

Changed regex parsing in embedhtml.py to XML parsing with lxml, regex…
This commit is contained in:
Josh Barnes
2017-10-05 11:52:22 +01:00
committed by GitHub
5 changed files with 73 additions and 53 deletions
+6 -2
View File
@@ -8,6 +8,10 @@ cache:
environment:
COVERALLS_REPO_TOKEN:
secure: lFyaxdbvCvXKM+PjmN9FToU8DhsdS474RgaW/bNAu4IBnn7QbfZzDYrjKw33V6Oo
global:
# lxml does not build from source on Windows unless the libxml headers are
# installed, so make pip install it from binary packages only.
PIP_ONLY_BINARY: lxml
matrix:
- TOXENV: 'py27-notebook'
@@ -39,13 +43,13 @@ environment:
PYTHON_HOME: C:\Python34
PYTHON_VERSION: '3.4'
PYTHON_ARCH: '32'
- TOXENV: 'py34-notebook43'
TOXPYTHON: C:\Python34\python.exe
PYTHON_HOME: C:\Python34
PYTHON_VERSION: '3.4'
PYTHON_ARCH: '32'
- TOXENV: 'py34-notebook44'
TOXPYTHON: C:\Python34\python.exe
PYTHON_HOME: C:\Python34
+1
View File
@@ -33,6 +33,7 @@ requirements:
- setuptools
- tornado
- traitlets >=4.1
- lxml >=3.8.0
test:
imports:
+1
View File
@@ -72,6 +72,7 @@ if you encounter any problems, and create a new issue if needed!
'pyyaml',
'tornado',
'traitlets >=4.1',
'lxml >=3.8.0'
],
extras_require={
'test': [
@@ -2,8 +2,8 @@
import base64
import os
import re
import lxml.etree as et
from ipython_genutils.ipstruct import Struct
from nbconvert.exporters.html import HTMLExporter
@@ -25,15 +25,20 @@ class EmbedHTMLExporter(HTMLExporter):
jupyter nbconvert --to html_embed mynotebook.ipynb
"""
def replfunc(self, match):
def replfunc(self, node):
"""Replace source url or file link with base64 encoded blob."""
url = match.group(1)
url = node.attrib["src"]
imgformat = url.split('.')[-1]
b64_data = None
prefix = None
if url.startswith('data'):
return # Already in base64 Format
self.log.info("try embedding url: %s, format: %s" % (url, imgformat))
if url.startswith('http'):
data = urlopen(url).read()
elif url.startswith('data'):
img = '<img src="' + url + '"'
return img
b64_data = base64.b64encode(urlopen(url).read()).decode("utf-8")
elif url.startswith('attachment'):
imgname = url.split(':')[1]
available_formats = self.attachments[imgname]
@@ -41,40 +46,47 @@ class EmbedHTMLExporter(HTMLExporter):
for imgformat in self.config.NbConvertBase.display_data_priority:
if imgformat in available_formats.keys():
b64_data = self.attachments[imgname][imgformat]
img = '<img src="data:' + imgformat + \
';base64,' + b64_data + '"'
return img
raise ValueError(
'Could not find attachment for image "%s" in notebook' %
imgname)
prefix = "data:%s;base64," % imgformat
if b64_data is None:
raise ValueError("""Could not find attachment for image '%s'
in notebook""" % imgname)
else:
filename = os.path.join(self.path, url)
with open(filename, 'rb') as f:
data = f.read()
b64_data = base64.b64encode(f.read()).decode("utf-8")
self.log.info("embedding url: %s, format: %s" % (url, imgformat))
b64_data = base64.b64encode(data).decode("utf-8")
if imgformat == "svg":
img = '<img src="data:image/svg+xml;base64,' + \
b64_data + '"'
elif imgformat == "pdf":
img = '<img src="data:application/pdf;base64,' + \
b64_data + '"'
else:
img = '<img src="data:image/' + imgformat + \
';base64,' + b64_data + '"'
return img
if prefix is None:
if imgformat == "svg":
prefix = "data:image/svg+xml;base64,"
elif imgformat == "pdf":
prefix = "data:application/pdf;base64,"
else:
prefix = "data:image/" + imgformat + ';base64,'
node.attrib["src"] = prefix + b64_data
def from_notebook_node(self, nb, resources=None, **kw):
output, resources = super(
EmbedHTMLExporter, self).from_notebook_node(nb, resources)
self.path = resources['metadata']['path']
# Get attachments
self.attachments = Struct()
for cell in nb.cells:
if 'attachments' in cell.keys():
self.attachments += cell['attachments']
regex = re.compile('<img\s+src="([^"]+)"')
embedded_output = regex.sub(self.replfunc, output)
# Parse HTML and replace <img> tags with the embedded data
parser = et.HTMLParser()
root = et.fromstring(output, parser=parser)
nodes = root.findall(".//img")
for n in nodes:
self.replfunc(n)
# Convert back to HTML
embedded_output = et.tostring(root.getroottree(),
method="html",
encoding='unicode')
return embedded_output, resources
+25 -23
View File
@@ -5,6 +5,7 @@ import io
import os
from functools import wraps
from lxml import etree as et
from nbconvert.tests.base import TestsBase
from nbformat import v4, write
@@ -24,32 +25,21 @@ def _with_tmp_cwd(func):
class TestNbConvertExporters(TestsBase):
def check_stuff_gets_embedded(self, nb, exporter_name, to_be_included=[]):
def check_html(self, nb, exporter_name, check_func):
nb_basename = 'notebook'
nb_src_filename = nb_basename + '.ipynb'
with io.open(nb_src_filename, 'w', encoding='utf-8') as f:
write(nb, f, 4)
# convert with default exporter
self.nbconvert('--to {} "{}"'.format('html', nb_src_filename))
nb_dst_filename = nb_basename + '.html'
assert os.path.isfile(nb_dst_filename)
statinfo = os.stat(nb_dst_filename)
os.remove(nb_dst_filename)
# convert with embedding exporter
nb_dst_filename = nb_basename + '.html'
self.nbconvert('--to {} "{}"'.format(exporter_name, nb_src_filename))
statinfo_e = os.stat(nb_dst_filename)
assert os.path.isfile(nb_dst_filename)
assert statinfo_e.st_size > statinfo.st_size
with io.open(nb_dst_filename, 'r', encoding='utf-8') as f:
with open(nb_dst_filename, 'rb') as f:
embedded_nb = f.read()
for txt in to_be_included:
assert txt in embedded_nb
parser = et.HTMLParser()
root = et.fromstring(embedded_nb, parser=parser)
check_func(byte_string=embedded_nb, root_node=root)
@_with_tmp_cwd
def test_embedhtml(self):
@@ -60,8 +50,14 @@ class TestNbConvertExporters(TestsBase):
source="![testimage]({})".format(path_in_data('icon.png'))
),
])
self.check_stuff_gets_embedded(
nb, 'html_embed', to_be_included=['base64'])
def check(byte_string, root_node):
nodes = root_node.findall(".//img")
for n in nodes:
url = n.attrib["src"]
assert url.startswith('data')
self.check_html(nb, 'html_embed', check_func=check)
@_with_tmp_cwd
def test_htmltoc2(self):
@@ -70,8 +66,11 @@ class TestNbConvertExporters(TestsBase):
v4.new_code_cell(source="a = 'world'"),
v4.new_markdown_cell(source="# Heading"),
])
self.check_stuff_gets_embedded(
nb, 'html_toc', to_be_included=['toc2'])
def check(byte_string, root_node):
assert b'toc2' in byte_string
self.check_html(nb, 'html_toc', check_func=check)
@_with_tmp_cwd
def test_html_collapsible_headings(self):
@@ -84,5 +83,8 @@ class TestNbConvertExporters(TestsBase):
v4.new_markdown_cell(source=('### level 3 heading')),
v4.new_code_cell(source='a = range(1,10)'),
])
self.check_stuff_gets_embedded(
nb, 'html_ch', to_be_included=['collapsible_headings'])
def check(byte_string, root_node):
assert b'collapsible_headings' in byte_string
self.check_html(nb, 'html_ch', check_func=check)