Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lenient file-type checking mode #10862 #10863

Merged
merged 9 commits into from
Jun 25, 2024
30 changes: 21 additions & 9 deletions arches/app/etl_modules/jsonld_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,27 @@ def read(self, request):
),
}

try:
self.read_zip_file(content, result)
except zipfile.BadZipFile as e:
return {
"success": False,
"data": FileValidationError(
message=e.args[0],
code=400,
),
}
Comment on lines +102 to +111
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noticed that in #10885 I forgot to surface this error in the UI where FILE_TYPE_CHECKING is false and we go straight to trying read a file as a zip file.


if not result["summary"]["files"]:
title = _("Invalid Uploaded File")
message = _(
"This file has missing information or invalid formatting. Make sure the file is complete and in the expected format."
)
return {"success": False, "data": {"title": title, "message": message}}

return {"success": True, "data": result}

def read_zip_file(self, content, result):
with zipfile.ZipFile(content, "r") as zip_ref:
files = zip_ref.infolist()
for file in files:
Expand Down Expand Up @@ -127,15 +148,6 @@ def read(self, request):

default_storage.save(destination, f)

if not result["summary"]["files"]:
title = _("Invalid Uploaded File")
message = _(
"This file has missing information or invalid formatting. Make sure the file is complete and in the expected format."
)
return {"success": False, "data": {"title": title, "message": message}}

return {"success": True, "data": result}

def validate_uploaded_file(self, file):
path = Path(file.name)
slug = path.parts[1]
Expand Down
18 changes: 13 additions & 5 deletions arches/app/utils/file_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,17 @@ def test_unknown_filetypes(self, file, extension=None):
errors = []
match extension:
case "DS_Store":
self.logger.log(
logging.WARN, "DS_Store file encountered, proceeding with caution."
)
case _ if extension not in settings.FILE_TYPES:
if settings.FILE_TYPE_CHECKING == "lenient":
self.logger.log(
logging.WARN,
"DS_Store file encountered, proceeding with caution.",
)
else:
errors.append(f"File type is not permitted: {extension}")
case _ if (
extension not in settings.FILE_TYPES
and (settings.FILE_TYPE_CHECKING != "lenient" or extension is not None)
):
errors.append(f"File type is not permitted: {extension}")
case "xlsx":
try:
Expand All @@ -46,7 +53,8 @@ def test_unknown_filetypes(self, file, extension=None):
except json.decoder.JSONDecodeError:
errors.append("Invalid json file")
case _:
errors.append("Cannot validate file")
if settings.FILE_TYPE_CHECKING != "lenient":
errors.append("Cannot validate file")

for error in errors:
self.logger.log(logging.ERROR, error)
Expand Down
14 changes: 14 additions & 0 deletions arches/apps.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
import warnings

from django.conf import settings
from django.core.checks import register, Tags, Error, Warning

### GLOBAL DEPRECATIONS ###
FILE_TYPE_CHECKING_MSG = (
"Providing boolean values to FILE_TYPE_CHECKING is deprecated. "
"Starting with Arches 8.0, the only allowed options will be "
"None, 'lenient', and 'strict'."
)
if settings.FILE_TYPE_CHECKING in (True, False):
warnings.warn(FILE_TYPE_CHECKING_MSG, DeprecationWarning)


### SYSTEM CHECKS ###


@register(Tags.security)
def check_cache_backend_for_production(app_configs, **kwargs):
Expand Down
19 changes: 17 additions & 2 deletions arches/install/arches-templates/project_name/settings.py-tpl
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,23 @@ SEARCH_COMPONENT_LOCATIONS.append('{{ project_name }}.search_components')

LOCALE_PATHS.insert(0, os.path.join(APP_ROOT, 'locale'))

FILE_TYPE_CHECKING = False
FILE_TYPES = ["bmp", "gif", "jpg", "jpeg", "json", "pdf", "png", "psd", "rtf", "tif", "tiff", "xlsx", "csv", "zip"]
FILE_TYPE_CHECKING = "lenient"
FILE_TYPES = [
"bmp",
"gif",
"jpg",
"jpeg",
"json",
"pdf",
"png",
"psd",
"rtf",
"tif",
"tiff",
"xlsx",
"csv",
"zip",
]
FILENAME_GENERATOR = "arches.app.utils.storage_filename_generator.generate_filename"
UPLOADED_FILES_DIR = "uploadedfiles"

Expand Down
2 changes: 1 addition & 1 deletion arches/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@
"arches.app.etl_modules",
]

FILE_TYPE_CHECKING = False
FILE_TYPE_CHECKING = "lenient"
FILE_TYPES = [
"bmp",
"gif",
Expand Down
24 changes: 23 additions & 1 deletion releases/7.6.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ Arches 7.6.0 Release Notes
- 9769 Ensure resource creation edit log timestamps precede resource update timestamps
- 10738 Adds Github action for comparing test coverage between branches and rejecting branches that lower test coverage
- 10842 Adds project-level testing and GitHub test runners
- 10862 Lenient file-type checking mode
- 10699 Allow overriding search_results view
- 10911 Styling fix in resource model manage menu
- 10726 Upgrade openpyxl package to 3.1.2 and fixes ETL modules
Expand Down Expand Up @@ -155,6 +156,15 @@ Minor incompatibilities:
is now a more attractive target for overriding than `run_load_task()`.


### Deprecations

- Boolean values for the `FILE_TYPE_CHECKING` setting are deprecated. Starting with Arches 8.0, the allowed values will be:
- `None`
- `"lenient"`
- `"strict"`

For more, see the [documentation]() for this setting.

### Upgrading Arches

1. You must be upgraded to at least version 7.5.0 before proceeding. If you are on an earlier version, please refer to the upgrade process in the [Version 7.5.0 release notes](https://github.com/archesproject/arches/blob/dev/7.5.x/releases/7.5.0.md)
Expand Down Expand Up @@ -355,7 +365,19 @@ Minor incompatibilities:
- `npm run build_production` This builds a production bundle. **takes up to 2hrs depending on resources**
- Alternatively you can run `python manage.py build_production`. This will create a production bundle of frontend assessts and also call `collectstatic`.

16. If you are running Arches on Apache, be sure to run:
16. Test your project with deprecation warnings enabled.

```
python -Wall::DeprecationWarning manage.py test --settings=tests.test_settings
```

17. Run system checks against your production settings.

```
python manage.py check --deploy --settings=path.to.production.settings
```

18. If you are running Arches on Apache, be sure to run:

```
collectstatic
Expand Down
2 changes: 2 additions & 0 deletions tests/test_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@
"arches.W001"
) # Cache backend does not support rate-limiting

FILE_TYPE_CHECKING = "lenient"

# could add Chrome, PhantomJS etc... here
LOCAL_BROWSERS = [] # ['Firefox']

Expand Down
149 changes: 149 additions & 0 deletions tests/utils/test_file_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import os
import shutil
from pathlib import Path

from unittest.mock import Mock, patch

from django.conf import settings
from django.test import TestCase
from django.test.utils import override_settings

from arches.app.utils.file_validator import FileValidator

# these tests can be run from the command line via
# python manage.py test tests.utils.test_file_validator.FileValidatorTests --settings="tests.test_settings"


class MockFile:
@staticmethod
def read():
"""Return a jagged csv file (invalid row length)"""
return b"col1,col2\ndata1"

@staticmethod
def seek(offset):
return


class MockFileType:
def __init__(self, extension):
self.extension = extension


class FileValidatorTests(TestCase):
"""FILE_TYPE_CHECKING defaults to 'lenient': overridden as necessary."""

validator = FileValidator()
mock_file = MockFile()

@override_settings(FILE_TYPE_CHECKING=None)
def test_no_file_checking(self):
errors = self.validator.validate_file_type(self.mock_file)
self.assertEqual(errors, [])

@patch("filetype.guess", Mock(return_value=None))
def test_check_unknown_filetype_lenient(self):
errors = self.validator.validate_file_type(self.mock_file)
self.assertEqual(errors, [])

@patch("filetype.guess", Mock(return_value=None))
@override_settings(FILE_TYPE_CHECKING="strict")
def test_check_unknown_filetype_strict(self):
with self.assertLogs("arches.app.utils.file_validator", level="ERROR"):
errors = self.validator.validate_file_type(self.mock_file)
self.assertEqual(errors, ["File type is not permitted: None"])

@patch("filetype.guess", Mock(return_value=MockFileType("suspicious")))
def test_filetype_not_listed(self):
with self.assertLogs("arches.app.utils.file_validator", level="ERROR"):
errors = self.validator.validate_file_type(self.mock_file)
self.assertEqual(errors, ["Unsafe file type suspicious"])

@patch("filetype.guess", Mock(return_value=None))
def test_check_invalid_csv(self):
with self.assertLogs("arches.app.utils.file_validator", level="ERROR"):
errors = self.validator.validate_file_type(self.mock_file, extension="csv")
self.assertEqual(errors, ["Invalid csv file"])

@patch("filetype.guess", Mock(return_value=None))
def test_check_invalid_json(self):
with self.assertLogs("arches.app.utils.file_validator", level="ERROR"):
errors = self.validator.validate_file_type(self.mock_file, extension="json")
self.assertEqual(errors, ["Invalid json file"])

@patch("filetype.guess", Mock(return_value=None))
def test_check_invalid_jpeg_lenient(self):
errors = self.validator.validate_file_type(self.mock_file, extension="jpeg")
self.assertEqual(errors, [])

@patch("filetype.guess", Mock(return_value=None))
@override_settings(FILE_TYPE_CHECKING="strict")
def test_check_invalid_jpeg_strict(self):
with self.assertLogs("arches.app.utils.file_validator", level="ERROR"):
errors = self.validator.validate_file_type(self.mock_file, extension="jpeg")
self.assertEqual(errors, ["Cannot validate file"])

@patch("filetype.guess", Mock(return_value=None))
def test_check_invalid_jpeg_lenient(self):
errors = self.validator.validate_file_type(self.mock_file, extension="jpeg")
self.assertEqual(errors, [])

@patch("filetype.guess", Mock(return_value=None))
@override_settings(FILE_TYPE_CHECKING="strict")
def test_check_invalid_but_not_in_listed_types(self):
with self.assertLogs("arches.app.utils.file_validator", level="ERROR"):
errors = self.validator.validate_file_type(
self.mock_file, extension="notlisted"
)
self.assertEqual(errors, ["File type is not permitted: notlisted"])

@patch("filetype.guess", Mock(return_value=None))
def test_check_dsstore_lenient(self):
"""In lenient mode, we assume these might be present in .zip files."""
with self.assertLogs("arches.app.utils.file_validator", level="WARN"):
errors = self.validator.validate_file_type(
self.mock_file, extension="DS_Store"
)
self.assertEqual(errors, [])

@patch("filetype.guess", Mock(return_value=None))
@override_settings(FILE_TYPE_CHECKING="strict")
def test_check_dsstore_strict(self):
with self.assertLogs("arches.app.utils.file_validator", level="ERROR"):
errors = self.validator.validate_file_type(
self.mock_file, extension="DS_Store"
)
self.assertEqual(errors, ["File type is not permitted: DS_Store"])

@patch("filetype.guess", Mock(return_value=None))
@patch("arches.app.utils.file_validator.load_workbook", lambda noop: None)
def test_valid_xlsx(self):
errors = self.validator.validate_file_type(self.mock_file, extension="xlsx")
self.assertEqual(errors, [])

@patch("filetype.guess", Mock(return_value=None))
def test_invalid_xlsx(self):
with self.assertLogs("arches.app.utils.file_validator", level="ERROR"):
errors = self.validator.validate_file_type(self.mock_file, extension="xlsx")
self.assertEqual(errors, ["Invalid xlsx workbook"])

def test_zip(self):
# Zip up the files in the tests/fixtures/uploadedfiles dir
# Currently, contains a single .csv file and an empty dir.
dir_to_zip = Path(settings.MEDIA_ROOT) / "uploadedfiles"
zip_file_path = shutil.make_archive("test", "zip", dir_to_zip, dir_to_zip)
self.addCleanup(os.unlink, zip_file_path)

with open(zip_file_path, "rb") as file:
errors = self.validator.validate_file_type(file)
self.assertEqual(errors, [])

with (
open(zip_file_path, "rb") as file,
self.modify_settings(FILE_TYPES={"remove": "csv"}),
self.assertLogs("arches.app.utils.file_validator", level="ERROR"),
):
errors = self.validator.validate_file_type(file)
self.assertEqual(
errors, ["File type is not permitted: csv", "Unsafe zip file contents"]
)
Loading