-
Notifications
You must be signed in to change notification settings - Fork 572
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add ExtractAttachmentsPreprocessor (#1978)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Steven Silvester <[email protected]>
- Loading branch information
1 parent
b56c272
commit 27804f1
Showing
10 changed files
with
258 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
""" | ||
Module that extracts attachments from notebooks into their own files | ||
""" | ||
|
||
# Copyright (c) Jupyter Development Team. | ||
# Distributed under the terms of the Modified BSD License. | ||
|
||
import os | ||
from base64 import b64decode | ||
|
||
from traitlets import Bool, Unicode | ||
|
||
from .base import Preprocessor | ||
|
||
|
||
class ExtractAttachmentsPreprocessor(Preprocessor):
    """
    Extracts attachments from all (markdown and raw) cells in a notebook.

    The decoded attachment bytes are stored in ``resources`` keyed by their
    target path, and every ``attachment:<name>`` reference in the cell source
    is rewritten to that path.

    By default the attachments share ``resources["output_files_dir"]`` and
    ``resources["outputs"]``. When ``use_separate_dir`` is True they are placed
    in a directory named after ``attachments_directory_template`` and stored
    under ``resources["attachments"]`` instead.

    https://nbformat.readthedocs.io/en/latest/format_description.html#cell-attachments
    """

    attachments_directory_template = Unicode(
        "{notebook_name}_attachments",
        help="Directory to place attachments if use_separate_dir is True",
    ).tag(config=True)

    use_separate_dir = Bool(
        False,
        help="Whether to use output_files_dir (which ExtractOutput also uses) or "
        "create a separate directory for attachments",
    ).tag(config=True)

    def __init__(self, **kw):
        """
        Public constructor

        Keyword arguments are forwarded unchanged to the parent constructor.
        """
        super().__init__(**kw)
        # Directory prefix for extracted files; set in self.preprocess because
        # it depends on the resources passed in there.
        self.path_name = ""
        # Key in resources under which extracted attachments are stored.
        # Set here as a default, in case someone calls preprocess_cell
        # without going through preprocess first.
        self.resources_item_key = "attachments"

    def preprocess(self, nb, resources):
        """
        Determine some settings and apply preprocessor to notebook

        Parameters
        ----------
        nb : notebook node
            Notebook being converted.
        resources : dict-like
            Conversion resources. Reads ``unique_key`` (separate directory
            mode) or ``output_files_dir`` (shared mode), and is updated with
            the extracted attachments.

        Returns
        -------
        tuple
            The ``(nb, resources)`` pair returned by the parent ``preprocess``.
        """
        if self.use_separate_dir:
            self.path_name = self.attachments_directory_template.format(
                notebook_name=resources["unique_key"]
            )
            # Initialize resources for attachments
            resources["attachment_files_dir"] = self.path_name
            resources["attachments"] = {}
            self.resources_item_key = "attachments"
        else:
            # Use same resources as ExtractOutput
            self.path_name = resources["output_files_dir"]
            self.resources_item_key = "outputs"

        # Make sure the storage key exists and is a dict. ``get`` is used so a
        # plain dict without the key is initialised instead of raising KeyError.
        if not isinstance(resources.get(self.resources_item_key), dict):
            resources[self.resources_item_key] = {}

        return super().preprocess(nb, resources)

    def preprocess_cell(self, cell, resources, index):
        """
        Extract attachments to individual files and
        change references to them.
        E.g.
        '![image.png](attachment:021fdd80.png)'
        becomes
        '![image.png]({path_name}/021fdd80.png)'
        Assumes self.path_name and self.resources_item_key is set properly (usually in preprocess).

        Parameters
        ----------
        cell : notebook cell
            Cell being processed; its ``source`` is modified in place.
        resources : dict-like
            Conversion resources; decoded attachment bytes are added under
            ``resources[self.resources_item_key]``.
        index : int
            Index of the cell in the notebook (unused here).

        Returns
        -------
        tuple
            The ``(cell, resources)`` pair.
        """
        if "attachments" not in cell:
            return cell, resources

        for fname in cell.attachments:
            self.log.debug(f"Encountered attachment {fname}")

            # Right now there is no known situation with multiple mime types
            # under the same filename, so only the first one is read.
            mime_bundle = cell.attachments[fname]
            mimetype = next(iter(mime_bundle), None)
            if mimetype is None:
                # An empty bundle has no data to write. Skip it rather than
                # failing on an unset variable or reusing the bytes of the
                # previously processed attachment.
                self.log.warning("Attachment %s has no data, skipping", fname)
                continue

            # convert to bytes and decode
            decoded = b64decode(mime_bundle[mimetype].encode("utf-8"))

            # FilesWriter wants path to be in attachment filename here
            new_filename = os.path.join(self.path_name, fname)
            resources[self.resources_item_key][new_filename] = decoded

            # os.path.join on windows uses "\\" separator,
            # but files like markdown still want "/"
            if os.path.sep != "/":
                new_filename = new_filename.replace(os.path.sep, "/")
            cell.source = cell.source.replace("attachment:" + fname, new_filename)

        return cell, resources
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
"""Tests for the ExtractAttachments preprocessor""" | ||
|
||
# Copyright (c) IPython Development Team. | ||
# Distributed under the terms of the Modified BSD License. | ||
|
||
import os | ||
from base64 import b64decode | ||
|
||
from ..extractattachments import ExtractAttachmentsPreprocessor | ||
from .base import PreprocessorTestsBase | ||
|
||
|
||
class TestExtractAttachments(PreprocessorTestsBase):
    """Test cases for the ExtractAttachmentsPreprocessor in extractattachments.py"""

    def build_preprocessor(self):
        """Return an enabled ExtractAttachmentsPreprocessor instance"""
        extractor = ExtractAttachmentsPreprocessor()
        extractor.enabled = True
        return extractor

    def test_constructor(self):
        """Constructing the preprocessor should not raise"""
        self.build_preprocessor()

    def test_attachment(self):
        """Attachment bytes land in resources and the cell source is rewritten"""
        notebook = self.build_notebook(with_attachment=True)
        resources = self.build_resources()
        extractor = self.build_preprocessor()
        notebook, resources = extractor(notebook, resources)

        last_cell = notebook.cells[-1]

        # The attachment data is still present on the cell itself.
        cell_attachments = last_cell.attachments
        self.assertIn("image.png", cell_attachments)
        self.assertIn("image/png", cell_attachments["image.png"])
        # str -> bytes -> base64-decoded payload
        payload = b64decode(cell_attachments["image.png"]["image/png"].encode("utf-8"))
        self.assertEqual(payload, b"test")

        # The decoded bytes were stored alongside the regular outputs.
        self.assertIn("image.png", resources["outputs"])
        self.assertEqual(resources["outputs"]["image.png"], b"test")

        # The markdown reference now points at the extracted file.
        self.assertEqual(last_cell.source, "![image.png](image.png)")

    def test_attachment_with_directory(self):
        """Cell source and resource keys include output_files_dir when it is set"""
        output_dir = "outputs"
        notebook = self.build_notebook(with_attachment=True)
        resources = self.build_resources()
        resources["output_files_dir"] = output_dir
        extractor = self.build_preprocessor()
        notebook, resources = extractor(notebook, resources)

        # The resources key uses the platform separator ("\\" on Windows).
        expected_key = os.path.join(output_dir, "image.png")
        self.assertIn(expected_key, resources["outputs"])

        # The reference in the source always uses "/", regardless of platform.
        self.assertEqual(notebook.cells[-1].source, "![image.png](outputs/image.png)")

    def test_use_separate_dir_config(self):
        """use_separate_dir and attachments_directory_template are honoured"""
        notebook = self.build_notebook(with_attachment=True)
        resources = self.build_resources()
        # The notebook name feeds the directory template.
        resources["unique_key"] = "notebook1"

        extractor = self.build_preprocessor()
        extractor.use_separate_dir = True
        extractor.attachments_directory_template = "{notebook_name}_custom"
        notebook, resources = extractor(notebook, resources)

        # The resources key uses the platform separator ("\\" on Windows).
        expected_key = os.path.join("notebook1_custom", "image.png")
        self.assertIn(expected_key, resources["attachments"])

        # The reference in the source always uses "/", regardless of platform.
        self.assertEqual(
            notebook.cells[-1].source, "![image.png](notebook1_custom/image.png)"
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.