mirror of
https://github.com/wassname/jupyter_contrib_nbextensions.git
synced 2026-06-27 16:10:24 +08:00
Merge pull request #1052 from gabyx/master
Changed regex parsing in embedhtml.py to XML parsing with lxml, regex…
This commit is contained in:
+6
-2
@@ -8,6 +8,10 @@ cache:
|
||||
environment:
|
||||
COVERALLS_REPO_TOKEN:
|
||||
secure: lFyaxdbvCvXKM+PjmN9FToU8DhsdS474RgaW/bNAu4IBnn7QbfZzDYrjKw33V6Oo
|
||||
global:
|
||||
# lxml will not build appropriately from source on Windows without the
|
||||
# appropriate libxml headers. As a result, make pip use binary packages.
|
||||
PIP_ONLY_BINARY: lxml
|
||||
matrix:
|
||||
|
||||
- TOXENV: 'py27-notebook'
|
||||
@@ -39,13 +43,13 @@ environment:
|
||||
PYTHON_HOME: C:\Python34
|
||||
PYTHON_VERSION: '3.4'
|
||||
PYTHON_ARCH: '32'
|
||||
|
||||
|
||||
- TOXENV: 'py34-notebook43'
|
||||
TOXPYTHON: C:\Python34\python.exe
|
||||
PYTHON_HOME: C:\Python34
|
||||
PYTHON_VERSION: '3.4'
|
||||
PYTHON_ARCH: '32'
|
||||
|
||||
|
||||
- TOXENV: 'py34-notebook44'
|
||||
TOXPYTHON: C:\Python34\python.exe
|
||||
PYTHON_HOME: C:\Python34
|
||||
|
||||
@@ -33,6 +33,7 @@ requirements:
|
||||
- setuptools
|
||||
- tornado
|
||||
- traitlets >=4.1
|
||||
- lxml >=3.8.0
|
||||
|
||||
test:
|
||||
imports:
|
||||
|
||||
@@ -72,6 +72,7 @@ if you encounter any problems, and create a new issue if needed!
|
||||
'pyyaml',
|
||||
'tornado',
|
||||
'traitlets >=4.1',
|
||||
'lxml >=3.8.0'
|
||||
],
|
||||
extras_require={
|
||||
'test': [
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
|
||||
import base64
|
||||
import os
|
||||
import re
|
||||
|
||||
import lxml.etree as et
|
||||
from ipython_genutils.ipstruct import Struct
|
||||
from nbconvert.exporters.html import HTMLExporter
|
||||
|
||||
@@ -25,15 +25,20 @@ class EmbedHTMLExporter(HTMLExporter):
|
||||
jupyter nbconvert --to html_embed mynotebook.ipynb
|
||||
"""
|
||||
|
||||
def replfunc(self, match):
|
||||
def replfunc(self, node):
|
||||
"""Replace source url or file link with base64 encoded blob."""
|
||||
url = match.group(1)
|
||||
url = node.attrib["src"]
|
||||
imgformat = url.split('.')[-1]
|
||||
b64_data = None
|
||||
prefix = None
|
||||
|
||||
if url.startswith('data'):
|
||||
return # Already in base64 Format
|
||||
|
||||
self.log.info("try embedding url: %s, format: %s" % (url, imgformat))
|
||||
|
||||
if url.startswith('http'):
|
||||
data = urlopen(url).read()
|
||||
elif url.startswith('data'):
|
||||
img = '<img src="' + url + '"'
|
||||
return img
|
||||
b64_data = base64.b64encode(urlopen(url).read()).decode("utf-8")
|
||||
elif url.startswith('attachment'):
|
||||
imgname = url.split(':')[1]
|
||||
available_formats = self.attachments[imgname]
|
||||
@@ -41,40 +46,47 @@ class EmbedHTMLExporter(HTMLExporter):
|
||||
for imgformat in self.config.NbConvertBase.display_data_priority:
|
||||
if imgformat in available_formats.keys():
|
||||
b64_data = self.attachments[imgname][imgformat]
|
||||
img = '<img src="data:' + imgformat + \
|
||||
';base64,' + b64_data + '"'
|
||||
return img
|
||||
raise ValueError(
|
||||
'Could not find attachment for image "%s" in notebook' %
|
||||
imgname)
|
||||
prefix = "data:%s;base64," % imgformat
|
||||
if b64_data is None:
|
||||
raise ValueError("""Could not find attachment for image '%s'
|
||||
in notebook""" % imgname)
|
||||
else:
|
||||
filename = os.path.join(self.path, url)
|
||||
with open(filename, 'rb') as f:
|
||||
data = f.read()
|
||||
b64_data = base64.b64encode(f.read()).decode("utf-8")
|
||||
|
||||
self.log.info("embedding url: %s, format: %s" % (url, imgformat))
|
||||
b64_data = base64.b64encode(data).decode("utf-8")
|
||||
if imgformat == "svg":
|
||||
img = '<img src="data:image/svg+xml;base64,' + \
|
||||
b64_data + '"'
|
||||
elif imgformat == "pdf":
|
||||
img = '<img src="data:application/pdf;base64,' + \
|
||||
b64_data + '"'
|
||||
else:
|
||||
img = '<img src="data:image/' + imgformat + \
|
||||
';base64,' + b64_data + '"'
|
||||
return img
|
||||
if prefix is None:
|
||||
if imgformat == "svg":
|
||||
prefix = "data:image/svg+xml;base64,"
|
||||
elif imgformat == "pdf":
|
||||
prefix = "data:application/pdf;base64,"
|
||||
else:
|
||||
prefix = "data:image/" + imgformat + ';base64,'
|
||||
|
||||
node.attrib["src"] = prefix + b64_data
|
||||
|
||||
def from_notebook_node(self, nb, resources=None, **kw):
|
||||
output, resources = super(
|
||||
EmbedHTMLExporter, self).from_notebook_node(nb, resources)
|
||||
|
||||
self.path = resources['metadata']['path']
|
||||
|
||||
# Get attachments
|
||||
self.attachments = Struct()
|
||||
for cell in nb.cells:
|
||||
if 'attachments' in cell.keys():
|
||||
self.attachments += cell['attachments']
|
||||
regex = re.compile('<img\s+src="([^"]+)"')
|
||||
|
||||
embedded_output = regex.sub(self.replfunc, output)
|
||||
# Parse HTML and replace <img> tags with the embedded data
|
||||
parser = et.HTMLParser()
|
||||
root = et.fromstring(output, parser=parser)
|
||||
nodes = root.findall(".//img")
|
||||
for n in nodes:
|
||||
self.replfunc(n)
|
||||
|
||||
# Convert back to HTML
|
||||
embedded_output = et.tostring(root.getroottree(),
|
||||
method="html",
|
||||
encoding='unicode')
|
||||
|
||||
return embedded_output, resources
|
||||
|
||||
+25
-23
@@ -5,6 +5,7 @@ import io
|
||||
import os
|
||||
from functools import wraps
|
||||
|
||||
from lxml import etree as et
|
||||
from nbconvert.tests.base import TestsBase
|
||||
from nbformat import v4, write
|
||||
|
||||
@@ -24,32 +25,21 @@ def _with_tmp_cwd(func):
|
||||
|
||||
class TestNbConvertExporters(TestsBase):
|
||||
|
||||
def check_stuff_gets_embedded(self, nb, exporter_name, to_be_included=[]):
|
||||
def check_html(self, nb, exporter_name, check_func):
|
||||
nb_basename = 'notebook'
|
||||
nb_src_filename = nb_basename + '.ipynb'
|
||||
with io.open(nb_src_filename, 'w', encoding='utf-8') as f:
|
||||
write(nb, f, 4)
|
||||
|
||||
# convert with default exporter
|
||||
self.nbconvert('--to {} "{}"'.format('html', nb_src_filename))
|
||||
nb_dst_filename = nb_basename + '.html'
|
||||
assert os.path.isfile(nb_dst_filename)
|
||||
statinfo = os.stat(nb_dst_filename)
|
||||
|
||||
os.remove(nb_dst_filename)
|
||||
|
||||
# convert with embedding exporter
|
||||
nb_dst_filename = nb_basename + '.html'
|
||||
self.nbconvert('--to {} "{}"'.format(exporter_name, nb_src_filename))
|
||||
statinfo_e = os.stat(nb_dst_filename)
|
||||
assert os.path.isfile(nb_dst_filename)
|
||||
|
||||
assert statinfo_e.st_size > statinfo.st_size
|
||||
|
||||
with io.open(nb_dst_filename, 'r', encoding='utf-8') as f:
|
||||
with open(nb_dst_filename, 'rb') as f:
|
||||
embedded_nb = f.read()
|
||||
|
||||
for txt in to_be_included:
|
||||
assert txt in embedded_nb
|
||||
parser = et.HTMLParser()
|
||||
root = et.fromstring(embedded_nb, parser=parser)
|
||||
check_func(byte_string=embedded_nb, root_node=root)
|
||||
|
||||
@_with_tmp_cwd
|
||||
def test_embedhtml(self):
|
||||
@@ -60,8 +50,14 @@ class TestNbConvertExporters(TestsBase):
|
||||
source="".format(path_in_data('icon.png'))
|
||||
),
|
||||
])
|
||||
self.check_stuff_gets_embedded(
|
||||
nb, 'html_embed', to_be_included=['base64'])
|
||||
|
||||
def check(byte_string, root_node):
|
||||
nodes = root_node.findall(".//img")
|
||||
for n in nodes:
|
||||
url = n.attrib["src"]
|
||||
assert url.startswith('data')
|
||||
|
||||
self.check_html(nb, 'html_embed', check_func=check)
|
||||
|
||||
@_with_tmp_cwd
|
||||
def test_htmltoc2(self):
|
||||
@@ -70,8 +66,11 @@ class TestNbConvertExporters(TestsBase):
|
||||
v4.new_code_cell(source="a = 'world'"),
|
||||
v4.new_markdown_cell(source="# Heading"),
|
||||
])
|
||||
self.check_stuff_gets_embedded(
|
||||
nb, 'html_toc', to_be_included=['toc2'])
|
||||
|
||||
def check(byte_string, root_node):
|
||||
assert b'toc2' in byte_string
|
||||
|
||||
self.check_html(nb, 'html_toc', check_func=check)
|
||||
|
||||
@_with_tmp_cwd
|
||||
def test_html_collapsible_headings(self):
|
||||
@@ -84,5 +83,8 @@ class TestNbConvertExporters(TestsBase):
|
||||
v4.new_markdown_cell(source=('### level 3 heading')),
|
||||
v4.new_code_cell(source='a = range(1,10)'),
|
||||
])
|
||||
self.check_stuff_gets_embedded(
|
||||
nb, 'html_ch', to_be_included=['collapsible_headings'])
|
||||
|
||||
def check(byte_string, root_node):
|
||||
assert b'collapsible_headings' in byte_string
|
||||
|
||||
self.check_html(nb, 'html_ch', check_func=check)
|
||||
|
||||
Reference in New Issue
Block a user