From 27804f13185bcf1d55ab8cde7ae00dbd4a1e6a5b Mon Sep 17 00:00:00 2001 From: tuncbkose <48298909+tuncbkose@users.noreply.github.com> Date: Sat, 6 May 2023 17:13:49 +0300 Subject: [PATCH] Add ExtractAttachmentsPreprocessor (#1978) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Steven Silvester --- nbconvert/exporters/exporter.py | 1 + nbconvert/exporters/latex.py | 1 + nbconvert/exporters/markdown.py | 1 + nbconvert/exporters/pdf.py | 3 +- nbconvert/preprocessors/__init__.py | 1 + nbconvert/preprocessors/extractattachments.py | 108 ++++++++++++++++++ nbconvert/preprocessors/tests/base.py | 17 ++- .../tests/test_extractattachments.py | 87 ++++++++++++++ nbconvert/writers/files.py | 34 ++++-- nbconvert/writers/tests/test_files.py | 25 ++-- 10 files changed, 258 insertions(+), 20 deletions(-) create mode 100644 nbconvert/preprocessors/extractattachments.py create mode 100644 nbconvert/preprocessors/tests/test_extractattachments.py diff --git a/nbconvert/exporters/exporter.py b/nbconvert/exporters/exporter.py index 5023d59bf..f833a344e 100644 --- a/nbconvert/exporters/exporter.py +++ b/nbconvert/exporters/exporter.py @@ -95,6 +95,7 @@ class Exporter(LoggingConfigurable): "nbconvert.preprocessors.LatexPreprocessor", "nbconvert.preprocessors.HighlightMagicsPreprocessor", "nbconvert.preprocessors.ExtractOutputPreprocessor", + "nbconvert.preprocessors.ExtractAttachmentsPreprocessor", "nbconvert.preprocessors.ClearMetadataPreprocessor", ], help="""List of preprocessors available by default, by name, namespace, diff --git a/nbconvert/exporters/latex.py b/nbconvert/exporters/latex.py index 681323f4b..f1e6272c7 100644 --- a/nbconvert/exporters/latex.py +++ b/nbconvert/exporters/latex.py @@ -54,6 +54,7 @@ def default_config(self): "text/plain", ] }, + "ExtractAttachmentsPreprocessor": {"enabled": True}, "ExtractOutputPreprocessor": {"enabled": True}, "SVG2PDFPreprocessor": {"enabled": True}, "LatexPreprocessor": 
{"enabled": True}, diff --git a/nbconvert/exporters/markdown.py b/nbconvert/exporters/markdown.py index 28041077e..bdf6c2544 100644 --- a/nbconvert/exporters/markdown.py +++ b/nbconvert/exporters/markdown.py @@ -34,6 +34,7 @@ def _raw_mimetypes_default(self): def default_config(self): c = Config( { + "ExtractAttachmentsPreprocessor": {"enabled": True}, "ExtractOutputPreprocessor": {"enabled": True}, "NbConvertBase": { "display_data_priority": [ diff --git a/nbconvert/exporters/pdf.py b/nbconvert/exporters/pdf.py index cfcfe424a..db5f120de 100644 --- a/nbconvert/exporters/pdf.py +++ b/nbconvert/exporters/pdf.py @@ -210,8 +210,9 @@ def from_notebook_node(self, nb, resources=None, **kw): # convert output extension to pdf # the writer above required it to be tex resources["output_extension"] = ".pdf" - # clear figure outputs, extracted by latex export, + # clear figure outputs and attachments, extracted by latex export, # so we don't claim to be a multi-file export. resources.pop("outputs", None) + resources.pop("attachments", None) return pdf_data, resources diff --git a/nbconvert/preprocessors/__init__.py b/nbconvert/preprocessors/__init__.py index 5f3d88943..a13443d92 100644 --- a/nbconvert/preprocessors/__init__.py +++ b/nbconvert/preprocessors/__init__.py @@ -11,6 +11,7 @@ from .convertfigures import ConvertFiguresPreprocessor from .csshtmlheader import CSSHTMLHeaderPreprocessor from .execute import ExecutePreprocessor +from .extractattachments import ExtractAttachmentsPreprocessor from .extractoutput import ExtractOutputPreprocessor from .highlightmagics import HighlightMagicsPreprocessor from .latex import LatexPreprocessor diff --git a/nbconvert/preprocessors/extractattachments.py b/nbconvert/preprocessors/extractattachments.py new file mode 100644 index 000000000..763e614f5 --- /dev/null +++ b/nbconvert/preprocessors/extractattachments.py @@ -0,0 +1,108 @@ +""" +Module that extracts attachments from notebooks into their own files +""" + +# Copyright (c) 
Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+import os
+from base64 import b64decode
+
+from traitlets import Bool, Unicode
+
+from .base import Preprocessor
+
+
+class ExtractAttachmentsPreprocessor(Preprocessor):
+    """
+    Extracts attachments from all (markdown and raw) cells in a notebook.
+    By default files go to output_files_dir (shared with ExtractOutput); see use_separate_dir.
+    https://nbformat.readthedocs.io/en/latest/format_description.html#cell-attachments
+    """
+
+    attachments_directory_template = Unicode(
+        "{notebook_name}_attachments",
+        help="Directory to place attachments if use_separate_dir is True",
+    ).tag(config=True)
+
+    use_separate_dir = Bool(
+        False,
+        help="Whether to use output_files_dir (which ExtractOutput also uses) or "
+        "create a separate directory for attachments",
+    ).tag(config=True)
+
+    def __init__(self, **kw):
+        """
+        Public constructor; **kw is forwarded unchanged to the base class constructor.
+        """
+        super().__init__(**kw)
+        # directory path,
+        self.path_name = ""  # will be set in self.preprocess, needs resources
+        # Where extracted attachments are stored in resources
+        self.resources_item_key = (
+            "attachments"  # Here as a default, in case someone doesn't want to call preprocess
+        )
+
+    # Add condition and configurability here
+    def preprocess(self, nb, resources):
+        """
+        Set path_name/resources_item_key, run the base preprocess, return (nb, resources).
+        """
+        if self.use_separate_dir:
+            self.path_name = self.attachments_directory_template.format(
+                notebook_name=resources["unique_key"]
+            )
+            # Initialize resources for attachments
+            resources["attachment_files_dir"] = self.path_name
+            resources["attachments"] = {}
+            self.resources_item_key = "attachments"
+        else:
+            # Use same resources as ExtractOutput
+            self.path_name = resources.get("output_files_dir", "")
+            self.resources_item_key = "outputs"
+
+        # Make sure key exists
+        if not isinstance(resources.get(self.resources_item_key), dict):
+            resources[self.resources_item_key] = {}
+
+        nb, resources = super().preprocess(nb, resources)
+        return nb, resources
+
+    def preprocess_cell(self, cell, resources, index):
+        """
+        Extract attachments to individual files and change references to them.
+        Attachments with an empty mime bundle are skipped and left as they are.
+        E.g.
+        '![image.png](attachment:021fdd80.png)'
+        becomes
+        '![image.png]({path_name}/021fdd80.png)'
+        Assumes self.path_name and self.resources_item_key are set properly (usually in preprocess).
+        """
+        if "attachments" in cell:
+            for fname in cell.attachments:
+                self.log.debug(f"Encountered attachment {fname}")
+
+                # Add file for writer
+
+                # Right now I don't know of a situation where there would be multiple
+                # mime types under same filename, and I can't index into it without the mimetype.
+                # So I only read the first one.
+                encoded = next(iter(cell.attachments[fname].values()), None)
+                if encoded is None:
+                    # Empty mime bundle: nothing to decode, so leave this reference alone
+                    continue
+                decoded = b64decode(encoded.encode("utf-8"))
+
+                # FilesWriter wants path to be in attachment filename here
+                new_filename = os.path.join(self.path_name, fname)
+                resources[self.resources_item_key][new_filename] = decoded
+
+                # Edit the reference to the attachment
+
+                # os.path.join on windows uses "\\" separator,
+                # but files like markdown still want "/"
+                if os.path.sep != "/":
+                    new_filename = new_filename.replace(os.path.sep, "/")
+                cell.source = cell.source.replace("attachment:" + fname, new_filename)
+
+        return cell, resources
diff --git a/nbconvert/preprocessors/tests/base.py b/nbconvert/preprocessors/tests/base.py
index d2bcc7769..ba0b63517 100644
--- a/nbconvert/preprocessors/tests/base.py
+++ b/nbconvert/preprocessors/tests/base.py
@@ -3,6 +3,8 @@
 # Copyright (c) IPython Development Team.
 # Distributed under the terms of the Modified BSD License.
+from base64 import b64encode + from nbformat import v4 as nbformat from ...exporters.exporter import ResourcesDict @@ -12,7 +14,7 @@ class PreprocessorTestsBase(TestsBase): """Contains test functions preprocessor tests""" - def build_notebook(self, with_json_outputs=False): + def build_notebook(self, with_json_outputs=False, with_attachment=False): """Build a notebook in memory for use with preprocessor tests""" outputs = [ @@ -42,6 +44,19 @@ def build_notebook(self, with_json_outputs=False): nbformat.new_markdown_cell(source="$ e $"), ] + if with_attachment: + data = b"test" + encoded_data = b64encode(data) + # this is conversion of bytes to string, not base64 decoding + attachments = {"image.png": {"image/png": encoded_data.decode()}} + cells.extend( + [ + nbformat.new_markdown_cell( + source="![image.png](attachment:image.png)", attachments=attachments + ) + ] + ) + return nbformat.new_notebook(cells=cells) def build_resources(self): diff --git a/nbconvert/preprocessors/tests/test_extractattachments.py b/nbconvert/preprocessors/tests/test_extractattachments.py new file mode 100644 index 000000000..e709b9ef8 --- /dev/null +++ b/nbconvert/preprocessors/tests/test_extractattachments.py @@ -0,0 +1,87 @@ +"""Tests for the ExtractAttachments preprocessor""" + +# Copyright (c) IPython Development Team. +# Distributed under the terms of the Modified BSD License. 
+
+import os
+from base64 import b64decode
+
+from ..extractattachments import ExtractAttachmentsPreprocessor
+from .base import PreprocessorTestsBase
+
+
+class TestExtractAttachments(PreprocessorTestsBase):
+    """Contains test functions for extractattachments.py"""
+
+    def build_preprocessor(self):
+        """Make an instance of a preprocessor"""
+        preprocessor = ExtractAttachmentsPreprocessor()
+        preprocessor.enabled = True
+        return preprocessor
+
+    def test_constructor(self):
+        """Can an ExtractAttachmentsPreprocessor be constructed?"""
+        self.build_preprocessor()
+
+    def test_attachment(self):
+        """Test the output of the ExtractAttachmentsPreprocessor"""
+        nb = self.build_notebook(with_attachment=True)
+        res = self.build_resources()
+        preprocessor = self.build_preprocessor()
+        nb, res = preprocessor(nb, res)
+
+        # Extraction must leave the attachment data in the cell itself.
+        attachments = nb.cells[-1].attachments
+        self.assertIn("image.png", attachments)
+        self.assertIn("image/png", attachments["image.png"])
+        data = attachments["image.png"]["image/png"]
+        # convert to bytes, then b64 decode; the result stays bytes
+        data = b64decode(data.encode("utf-8"))
+        self.assertEqual(data, b"test")
+
+        # Verify the decoded attachment was added to the resources
+        self.assertIn("image.png", res["outputs"])
+        self.assertEqual(res["outputs"]["image.png"], b"test")
+
+        # Verify cell source changed appropriately
+        src = nb.cells[-1].source
+        self.assertEqual(src, "![image.png](image.png)")
+
+    def test_attachment_with_directory(self):
+        """Test that cell source modifications are correct when files are put in a directory"""
+        nb = self.build_notebook(with_attachment=True)
+        res = self.build_resources()
+        output_dir = "outputs"
+        res["output_files_dir"] = output_dir
+        preprocessor = self.build_preprocessor()
+        nb, res = preprocessor(nb, res)
+
+        # Verify attachment
+        # This can have "\\" separator on Windows
+        file_path = os.path.join("outputs", "image.png")
+        self.assertIn(file_path, res["outputs"])
+
+        # Verify cell source changed appropriately
+        src = nb.cells[-1].source
+        # This shouldn't change on Windows
+        self.assertEqual(src, "![image.png](outputs/image.png)")
+
+    def test_use_separate_dir_config(self):
+        """Test that use_separate_dir and attachments_directory_template work properly"""
+        nb = self.build_notebook(with_attachment=True)
+        res = self.build_resources()
+        res["unique_key"] = "notebook1"  # add notebook name for the folder
+        preprocessor = self.build_preprocessor()
+        preprocessor.use_separate_dir = True
+        preprocessor.attachments_directory_template = "{notebook_name}_custom"
+        nb, res = preprocessor(nb, res)
+
+        # Verify attachment
+        # This can have "\\" separator on Windows
+        file_path = os.path.join("notebook1_custom", "image.png")
+        self.assertIn(file_path, res["attachments"])
+
+        # Verify cell source changed appropriately
+        src = nb.cells[-1].source
+        # This shouldn't change on Windows
+        self.assertEqual(src, "![image.png](notebook1_custom/image.png)")
diff --git a/nbconvert/writers/files.py b/nbconvert/writers/files.py
index fb63b0285..852496c19 100644
--- a/nbconvert/writers/files.py
+++ b/nbconvert/writers/files.py
@@ -51,6 +51,19 @@ def _makedir(self, path):
             self.log.info("Making directory %s", path)
             ensure_dir_exists(path)
 
+    def _write_items(self, items, build_dir):
+        """Write (filename, binary data) pairs, e.g. dict.items(), beneath build_dir"""
+        for filename, data in items:
+            # Determine where to write the file to
+            dest = os.path.join(build_dir, filename)
+            path = os.path.dirname(dest)
+            self._makedir(path)
+
+            # Write file
+            self.log.debug("Writing %i bytes to %s", len(data), dest)
+            with open(dest, "wb") as f:
+                f.write(data)
+
     def write(self, output, resources, notebook_name=None, **kw):
         """
         Consume and write Jinja output to the file system. Output directory
@@ -73,7 +86,7 @@ def write(self, output, resources, notebook_name=None, **kw):
         relpath = self.relpath or resource_path
         build_directory = self.build_directory or resource_path
 
-        # Write all of the extracted resources to the destination directory.
+ # Write the extracted outputs to the destination directory. # NOTE: WE WRITE EVERYTHING AS-IF IT'S BINARY. THE EXTRACT FIG # PREPROCESSOR SHOULD HANDLE UNIX/WINDOWS LINE ENDINGS... @@ -83,16 +96,17 @@ def write(self, output, resources, notebook_name=None, **kw): "Support files will be in %s", os.path.join(resources.get("output_files_dir", ""), ""), ) - for filename, data in items: - # Determine where to write the file to - dest = os.path.join(build_directory, filename) - path = os.path.dirname(dest) - self._makedir(path) + self._write_items(items, build_directory) - # Write file - self.log.debug("Writing %i bytes to support file %s", len(data), dest) - with open(dest, "wb") as f: - f.write(data) + # Write the extracted attachments + # if ExtractAttachmentsOutput specified a separate directory + attachs = resources.get("attachments", {}).items() + if attachs: + self.log.info( + "Attachments will be in %s", + os.path.join(resources.get("attachment_files_dir", ""), ""), + ) + self._write_items(attachs, build_directory) # Copy referenced files to output directory if build_directory: diff --git a/nbconvert/writers/tests/test_files.py b/nbconvert/writers/tests/test_files.py index fcc0bf15d..bca0e59b3 100644 --- a/nbconvert/writers/tests/test_files.py +++ b/nbconvert/writers/tests/test_files.py @@ -19,7 +19,7 @@ def test_basic_output(self): # Work in a temporary directory. with self.create_temp_cwd(): - # Create the resoruces dictionary + # Create the resources dictionary res: dict = {} # Create files writer, test output @@ -54,8 +54,11 @@ def test_extract(self): # Work in a temporary directory. 
with self.create_temp_cwd(): - # Create the resoruces dictionary - res = {"outputs": {os.path.join("z_files", "a"): b"b"}} + # Create the resources dictionary + res = { + "outputs": {os.path.join("z_files", "a"): b"b"}, + "attachments": {os.path.join("z_attachments", "c"): b"d"}, + } # Create files writer, test output writer = FilesWriter() @@ -73,12 +76,18 @@ def test_extract(self): output = f.read() self.assertEqual(output, "b") + attachment_file_dest = os.path.join("z_attachments", "c") + assert os.path.isfile(attachment_file_dest) + with open(attachment_file_dest) as f: + content = f.read() + self.assertEqual(content, "d") + def test_build_dir(self): """Can FilesWriter write to a build dir correctly?""" # Work in a temporary directory. with self.create_temp_cwd(): - # Create the resoruces dictionary + # Create the resources dictionary res = {"outputs": {os.path.join("z_files", "a"): b"b"}} # Create files writer, test output @@ -122,7 +131,7 @@ def test_links(self): with open(os.path.join("sub", "c"), "w") as f: f.write("d") - # Create the resoruces dictionary + # Create the resources dictionary res: dict = {} # Create files writer, test output @@ -195,7 +204,7 @@ def test_relpath(self): with open(os.path.join("sub", "c"), "w") as f: f.write("d") - # Create the resoruces dictionary + # Create the resources dictionary res: dict = {} # Create files writer, test output @@ -229,7 +238,7 @@ def test_relpath_default(self): with open(os.path.join("sub", "c"), "w") as f: f.write("d") - # Create the resoruces dictionary + # Create the resources dictionary res = {"metadata": {"path": "sub"}} # Create files writer, test output @@ -262,7 +271,7 @@ def test_relpath_precedence(self): with open(os.path.join("sub", "c"), "w") as f: f.write("d") - # Create the resoruces dictionary + # Create the resources dictionary res = {"metadata": {"path": "other_sub"}} # Create files writer, test output