diff --git a/.gitignore b/.gitignore index 20ebd7e..421851c 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ output/* .coverage coverage.* +myenv/ dist/* */*.egg-info/* diff --git a/README.md b/README.md index 70afbee..f505367 100644 --- a/README.md +++ b/README.md @@ -15,30 +15,56 @@ pip install jp2_remediator==0.0.2 ## Usage -## Process one file -`python3 main.py --file tests/test-images/7514499.jp2` +```bash +python3 src/jp2_remediator/main.py -h + +usage: main.py [-h] {file,directory,bucket} ... + +JP2 file processor -`python3 main.py --file tests/test-images/481014278.jp2` +options: + -h, --help show this help message and exit -## Process directory -`python3 main.py --directory tests/test-images/` +Input source: + {file,directory,bucket} + file Process a single JP2 file + directory Process all JP2 files in a directory + bucket Process all JP2 files in an S3 bucket +``` -## Process Amazon S3 bucket -`python3 main.py --bucket your-bucket-name --prefix optional-prefix` +### Process one file +```bash +python3 src/jp2_remediator/main.py file tests/test-images/7514499.jp2 -## Process all .jp2 files in the bucket: -`python3 main.py --bucket remediation-folder` +python3 src/jp2_remediator/main.py file tests/test-images/481014278.jp2 +``` + +### Process directory +```bash +python3 src/jp2_remediator/main.py directory tests/test-images/ +``` + +### Process all .jp2 files in an S3 bucket: +```bash +python3 src/jp2_remediator/main.py bucket remediation-folder +``` -## Process only files with a specific prefix (folder): -`python3 main.py --bucket remediation-folder --prefix testbatch_20240923` +### Process only files with a specific prefix (folder): +```bash +python3 src/jp2_remediator/main.py bucket remediation-folder --prefix testbatch_20240923 +``` -`python3 main.py --help` +## Run tests -## Run Tests -`python3 test_aws_connection.py` +### Run integration tests +```bash +pytest src/jp2_remediator/tests/integration/ +``` -### Run from src folder -`python3 -m 
unittest jp2_remediator.tests.unit.test_box_reader` +### Run unit tests +```bash +pytest src/jp2_remediator/tests/unit/ +``` ## Docker environment @@ -51,3 +77,13 @@ Start Docker container ```bash ./bin/docker-run.sh ``` + +## Development environment +```bash +python3 -m venv myenv +source myenv/bin/activate +export PYTHONPATH="${PYTHONPATH}:src" +pip install -r requirements.txt + +python src/jp2_remediator/main.py -h +``` diff --git a/src/jp2_remediator/box_reader.py b/src/jp2_remediator/box_reader.py index 602e26b..3d380ac 100644 --- a/src/jp2_remediator/box_reader.py +++ b/src/jp2_remediator/box_reader.py @@ -1,5 +1,3 @@ -import os -import boto3 import datetime from jp2_remediator import configure_logger from jpylyzer import boxvalidator @@ -169,7 +167,7 @@ def write_modified_file(self, new_file_contents): new_file.write(new_file_contents) self.logger.info(f"New JP2 file created with modifications: {new_file_path}") else: - self.logger.debug("No modifications needed. No new file created.") + self.logger.info(f"No modifications needed. No new file created: {self.file_path}") def read_jp2_file(self): # Main function to read, validate, and modify JP2 files. @@ -178,43 +176,9 @@ def read_jp2_file(self): self.initialize_validator() is_valid = self.validator._isValid() - self.logger.info("Is file valid?", is_valid) + self.logger.info(f"Is file valid? {is_valid}") header_offset_position = self.check_boxes() new_file_contents = self.process_all_trc_tags(header_offset_position) self.write_modified_file(new_file_contents) - - -def process_directory(directory_path): - # Process all JP2 files in a given directory. - for root, _, files in os.walk(directory_path): - for file in files: - if file.lower().endswith(".jp2"): - file_path = os.path.join(root, file) - print(f"Processing file: {file_path}") - reader = BoxReader(file_path) - reader.read_jp2_file() - - -def process_s3_bucket(bucket_name, prefix=""): - # Process all JP2 files in a given S3 bucket. 
- s3 = boto3.client("s3") - response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix) - - if "Contents" in response: - for obj in response["Contents"]: - if obj["Key"].lower().endswith(".jp2"): - file_path = obj["Key"] - print(f"Processing file: {file_path} from bucket {bucket_name}") - download_path = f"/tmp/{os.path.basename(file_path)}" - s3.download_file(bucket_name, file_path, download_path) - reader = BoxReader(download_path) - reader.read_jp2_file() - # Optionally, upload modified file back to S3 - timestamp = datetime.datetime.now().strftime("%Y%m%d") # use "%Y%m%d_%H%M%S" for more precision - s3.upload_file( - download_path.replace(".jp2", f"_modified_{timestamp}.jp2"), - bucket_name, - file_path.replace(".jp2", f"_modified_{timestamp}.jp2"), - ) diff --git a/src/jp2_remediator/box_reader_factory.py b/src/jp2_remediator/box_reader_factory.py new file mode 100644 index 0000000..00c8a46 --- /dev/null +++ b/src/jp2_remediator/box_reader_factory.py @@ -0,0 +1,12 @@ +from jp2_remediator.box_reader import BoxReader + + +class BoxReaderFactory: + + def get_reader(self, file_path): + """ + Create a BoxReader instance for a given file path. + :param file_path: The path to the file to be read. + :return: A BoxReader instance. 
+ """ + return BoxReader(file_path) diff --git a/src/jp2_remediator/main.py b/src/jp2_remediator/main.py index a64a55e..2ccf693 100644 --- a/src/jp2_remediator/main.py +++ b/src/jp2_remediator/main.py @@ -1,31 +1,62 @@ import argparse -from jp2_remediator.box_reader import BoxReader, process_directory, process_s3_bucket +from jp2_remediator.box_reader_factory import BoxReaderFactory +from jp2_remediator.processor import Processor def main(): + """Main entry point for the JP2 file processor.""" + processor = Processor(BoxReaderFactory()) + parser = argparse.ArgumentParser(description="JP2 file processor") - parser.add_argument("--file", help="Path to a single JP2 file to process.") - parser.add_argument( - "--directory", help="Path to a directory of JP2 files to process." + + # Create mutually exclusive subparsers for specifying input source + subparsers = parser.add_subparsers( + title="Input source", dest="input_source" + ) + + # Subparser for processing a single JP2 file + file_parser = subparsers.add_parser( + "file", help="Process a single JP2 file" + ) + file_parser.add_argument( + "file", help="Path to a single JP2 file to process" + ) + file_parser.set_defaults( + func=lambda args: processor.process_file(args.file) + ) + + # Subparser for processing all JP2 files in a directory + directory_parser = subparsers.add_parser( + "directory", help="Process all JP2 files in a directory" + ) + directory_parser.add_argument( + "directory", help="Path to a directory of JP2 files to process" + ) + directory_parser.set_defaults( + func=lambda args: processor.process_directory(args.directory) + ) + + # Subparser for processing all JP2 files in an S3 bucket + bucket_parser = subparsers.add_parser( + "bucket", help="Process all JP2 files in an S3 bucket" + ) + bucket_parser.add_argument( + "bucket", help="Name of the AWS S3 bucket to process JP2 files from" ) - parser.add_argument( - "--bucket", help="Name of the AWS S3 bucket to process JP2 files from." 
+ bucket_parser.add_argument( + "--prefix", help="Prefix of files in the AWS S3 bucket (optional)", + default="" ) - parser.add_argument( - "--prefix", help="Prefix of files in the AWS S3 bucket (optional)." + bucket_parser.set_defaults( + func=lambda args: processor.process_s3_bucket(args.bucket, args.prefix) ) args = parser.parse_args() - if args.file: - reader = BoxReader(args.file) - reader.read_jp2_file() - elif args.directory: - process_directory(args.directory) - elif args.bucket: - process_s3_bucket(args.bucket, args.prefix) + if hasattr(args, "func"): + args.func(args) else: - print("Please specify either --file, --directory, or --bucket.") + parser.print_help() if __name__ == "__main__": diff --git a/src/jp2_remediator/processor.py b/src/jp2_remediator/processor.py new file mode 100644 index 0000000..7727ad8 --- /dev/null +++ b/src/jp2_remediator/processor.py @@ -0,0 +1,55 @@ +import datetime +import os +import boto3 + + +class Processor: + """Class to process JP2 files.""" + + def __init__(self, factory): + """Initialize the Processor with a BoxReader factory.""" + self.box_reader_factory = factory + + def process_file(self, file_path): + """Process a single JP2 file.""" + print(f"Processing file: {file_path}") + reader = self.box_reader_factory.get_reader(file_path) + reader.read_jp2_file() + + def process_directory(self, directory_path): + """Process all JP2 files in a given directory.""" + for root, _, files in os.walk(directory_path): + for file in files: + if file.lower().endswith(".jp2"): + file_path = os.path.join(root, file) + print(f"Processing file: {file_path}") + reader = self.box_reader_factory.get_reader(file_path) + reader.read_jp2_file() + + def process_s3_bucket(self, bucket_name, prefix=""): + """Process all JP2 files in a given S3 bucket.""" + s3 = boto3.client("s3") + response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix) + + if "Contents" in response: + for obj in response["Contents"]: + if 
obj["Key"].lower().endswith(".jp2"): + file_path = obj["Key"] + print(f"""Processing file: {file_path} from bucket { + bucket_name + }""") + download_path = f"/tmp/{os.path.basename(file_path)}" + s3.download_file(bucket_name, file_path, download_path) + reader = self.box_reader_factory.get_reader(download_path) + reader.read_jp2_file() + # Optionally, upload modified file back to S3 + timestamp = datetime.datetime.now().strftime( + "%Y%m%d" + ) # use "%Y%m%d_%H%M%S" for more precision + s3.upload_file( + download_path.replace( + ".jp2", f"_modified_{timestamp}.jp2" + ), + bucket_name, + file_path.replace(".jp2", f"_modified_{timestamp}.jp2") + ) diff --git a/src/jp2_remediator/tests/unit/test_box_reader.py b/src/jp2_remediator/tests/unit/test_box_reader.py index ba1bf86..e100e21 100644 --- a/src/jp2_remediator/tests/unit/test_box_reader.py +++ b/src/jp2_remediator/tests/unit/test_box_reader.py @@ -1,7 +1,7 @@ import unittest import os from unittest.mock import patch, mock_open, MagicMock -from jp2_remediator.box_reader import BoxReader, process_directory, process_s3_bucket +from jp2_remediator.box_reader import BoxReader from jpylyzer import boxvalidator from project_paths import paths import datetime @@ -137,29 +137,6 @@ def test_process_all_trc_tags(self): ) self.assertEqual(modified_contents, self.reader.file_contents) - # Test for process_directory function - @patch("jp2_remediator.box_reader.BoxReader") - @patch("os.walk", return_value=[("root", [], ["file1.jp2", "file2.jp2"])]) - @patch("builtins.print") - def test_process_directory_with_multiple_files( - self, mock_print, mock_os_walk, mock_box_reader - ): - # Process a dir with multiple jp2 files - # Mock the logger for each BoxReader instance created - mock_box_reader.return_value.logger = MagicMock() - - # Call process_directory with a dummy path - process_directory("dummy_path") - - # Check that each JP2 file in the directory was processed - mock_print.assert_any_call("Processing file: root/file1.jp2") 
- mock_print.assert_any_call("Processing file: root/file2.jp2") - - # Ensure each BoxReader instance had its read_jp2_file method called - self.assertEqual( - mock_box_reader.return_value.read_jp2_file.call_count, 2 - ) - # Test for check_boxes method logging when 'jp2h' not found def test_jp2h_not_found_logging(self): # Set up file_contents to simulate a missing 'jp2h' box @@ -186,9 +163,10 @@ def test_write_modified_file_no_changes(self, mock_file): mock_file.assert_not_called() # Check that the specific debug message was logged - self.reader.logger.debug.assert_called_once_with( - "No modifications needed. No new file created." - ) + self.reader.logger.info.assert_called_once() + pattern = r"No modifications needed\. No new file created: .*sample\.jp2" + call_args = self.reader.logger.info.call_args + self.assertRegex(call_args[0][0], pattern) # Test for process_colr_box method when meth_value == 1 def test_process_colr_box_meth_value_1(self): @@ -316,7 +294,7 @@ def test_read_jp2_file(self): mock_validator._isValid.assert_called_once() # Assert that logger.info was called with correct parameters - self.reader.logger.info.assert_called_with("Is file valid?", True) + self.reader.logger.info.assert_called_with("Is file valid? 
True") # Assert that check_boxes was called once mock_check_boxes.assert_called_once() @@ -347,82 +325,6 @@ def test_read_jp2_file_no_file_contents(self): mock_process_all_trc_tags.assert_not_called() mock_write_modified_file.assert_not_called() - # Test for process_s3_bucket function - @patch("jp2_remediator.box_reader.boto3.client") - @patch("jp2_remediator.box_reader.BoxReader") - @patch("builtins.print") - def test_process_s3_bucket(self, mock_print, mock_box_reader, mock_boto3_client): - # Set up the mock S3 client - mock_s3_client = MagicMock() - mock_boto3_client.return_value = mock_s3_client - - # Define the bucket name and prefix - bucket_name = "test-bucket" - prefix = "test-prefix" - - # Prepare a fake response for list_objects_v2 - mock_s3_client.list_objects_v2.return_value = { - "Contents": [ - {"Key": "file1.jp2"}, - {"Key": "file2.jp2"}, - {"Key": "file3.txt"}, # Non-JP2 file to test filtering - ] - } - - # Mock download_file and upload_file methods - mock_s3_client.download_file.return_value = None - mock_s3_client.upload_file.return_value = None - - # Mock BoxReader instance and its read_jp2_file method - mock_reader_instance = MagicMock() - mock_box_reader.return_value = mock_reader_instance - - # Call the method under test - process_s3_bucket(bucket_name, prefix) - - # Verify that list_objects_v2 was called with the correct parameters - mock_s3_client.list_objects_v2.assert_called_once_with(Bucket=bucket_name, Prefix=prefix) - - # Verify that download_file was called for each .jp2 file - expected_download_calls = [ - unittest.mock.call(bucket_name, "file1.jp2", "/tmp/file1.jp2"), - unittest.mock.call(bucket_name, "file2.jp2", "/tmp/file2.jp2"), - ] - self.assertEqual(mock_s3_client.download_file.call_args_list, expected_download_calls) - - # Verify that BoxReader was instantiated with the correct download paths - expected_boxreader_calls = [ - unittest.mock.call("/tmp/file1.jp2"), - unittest.mock.call("/tmp/file2.jp2"), - ] - 
self.assertEqual(mock_box_reader.call_args_list, expected_boxreader_calls) - - # Verify that read_jp2_file was called for each .jp2 file - self.assertEqual(mock_reader_instance.read_jp2_file.call_count, 2) - - # Verify that upload_file was called for each .jp2 file - upload_calls = mock_s3_client.upload_file.call_args_list - self.assertEqual(len(upload_calls), 2) - for call in upload_calls: - args, _ = call - local_file_path = args[0] - upload_bucket = args[1] - upload_key = args[2] - # Check that the local file path includes '_modified_' and ends with '.jp2' - self.assertIn("_modified_", local_file_path) - self.assertTrue(local_file_path.endswith(".jp2")) - # Check that the upload is to the correct bucket and key - self.assertEqual(upload_bucket, bucket_name) - self.assertIn("_modified_", upload_key) - self.assertTrue(upload_key.endswith(".jp2")) - - # Verify that print was called correctly - expected_print_calls = [ - unittest.mock.call(f"Processing file: file1.jp2 from bucket {bucket_name}"), - unittest.mock.call(f"Processing file: file2.jp2 from bucket {bucket_name}"), - ] - mock_print.assert_has_calls(expected_print_calls, any_order=True) - # Test for process_trc_tag: when trc_tag_size != curv_trc_field_length def test_process_trc_tag_size_mismatch(self): # Prepare test data where trc_tag_size does not match curv_trc_field_length diff --git a/src/jp2_remediator/tests/unit/test_processor.py b/src/jp2_remediator/tests/unit/test_processor.py new file mode 100644 index 0000000..1122ae8 --- /dev/null +++ b/src/jp2_remediator/tests/unit/test_processor.py @@ -0,0 +1,125 @@ +import unittest +import pytest +from unittest.mock import call, patch, MagicMock +from jp2_remediator.processor import Processor + + +class TestProcessor: + + @pytest.fixture + def mock_box_reader_factory(self): + return MagicMock() + + @pytest.fixture + def processor(self, mock_box_reader_factory): + return Processor(mock_box_reader_factory) + + # Test for process_file function + 
@patch("builtins.print") + def test_process_file(self, mock_print, processor, mock_box_reader_factory): + # Define the file path + file_path = "test_file.jp2" + + # Call process_file with the test file path + processor.process_file(file_path) + + # Check that the file was processed + mock_print.assert_called_once_with(f"Processing file: {file_path}") + + # Ensure the BoxReader instance had its read_jp2_file method called + mock_box_reader_factory.get_reader.assert_called_once_with(file_path) + mock_box_reader_factory.get_reader.return_value.read_jp2_file.assert_called_once() + + # Test for process_directory function + @patch("os.walk", return_value=[("root", [], ["file1.jp2", "file2.jp2"])]) + @patch("builtins.print") + def test_process_directory_with_multiple_files( + self, mock_print, mock_os_walk, processor, mock_box_reader_factory + ): + # Call process_directory with a dummy path + processor.process_directory("dummy_path") + + # Check that each JP2 file in the directory was processed + mock_print.assert_any_call("Processing file: root/file1.jp2") + mock_print.assert_any_call("Processing file: root/file2.jp2") + + # Ensure each BoxReader instance had its read_jp2_file method called + assert mock_box_reader_factory.get_reader.call_count == 2 + + # Ensure each BoxReader instance was created with the correct file path + assert mock_box_reader_factory.get_reader.call_args_list == [ + call("root/file1.jp2"), + call("root/file2.jp2"), + ] + assert mock_box_reader_factory.get_reader.return_value.read_jp2_file.call_count == 2 + + # Test for process_s3_bucket function + @patch("jp2_remediator.processor.boto3.client") + @patch("builtins.print") + def test_process_s3_bucket(self, mock_print, mock_boto3_client, processor, mock_box_reader_factory): + # Set up the mock S3 client + mock_s3_client = MagicMock() + mock_boto3_client.return_value = mock_s3_client + + # Define the bucket name and prefix + bucket_name = "test-bucket" + prefix = "test-prefix" + + # Prepare a fake 
response for list_objects_v2 + mock_s3_client.list_objects_v2.return_value = { + "Contents": [ + {"Key": "file1.jp2"}, + {"Key": "file2.jp2"}, + {"Key": "file3.txt"}, # Non-JP2 file to test filtering + ] + } + + # Mock download_file and upload_file methods + mock_s3_client.download_file.return_value = None + mock_s3_client.upload_file.return_value = None + + # Call the method under test + processor.process_s3_bucket(bucket_name, prefix) + + # Verify that list_objects_v2 was called with the correct parameters + mock_s3_client.list_objects_v2.assert_called_once_with(Bucket=bucket_name, Prefix=prefix) + + # Verify that download_file was called for each .jp2 file + expected_download_calls = [ + unittest.mock.call(bucket_name, "file1.jp2", "/tmp/file1.jp2"), + unittest.mock.call(bucket_name, "file2.jp2", "/tmp/file2.jp2"), + ] + assert mock_s3_client.download_file.call_args_list == expected_download_calls + + # Verify that BoxReader was instantiated with the correct download paths + expected_boxreader_calls = [ + unittest.mock.call("/tmp/file1.jp2"), + unittest.mock.call("/tmp/file2.jp2"), + ] + assert mock_box_reader_factory.get_reader.call_args_list == expected_boxreader_calls + + # Verify that read_jp2_file was called for each .jp2 file + assert mock_box_reader_factory.get_reader.return_value.read_jp2_file.call_count == 2 + + # Verify that upload_file was called for each .jp2 file + upload_calls = mock_s3_client.upload_file.call_args_list + assert len(upload_calls) == 2 + for c in upload_calls: + args, _ = c + local_file_path = args[0] + upload_bucket = args[1] + upload_key = args[2] + # Check that the local file path includes '_modified_' and ends with '.jp2' + assert "_modified_" in local_file_path, "'_modified_' should be in local_file_path" + assert local_file_path.endswith(".jp2") + # Check that the upload is to the correct bucket and key + assert upload_bucket == bucket_name + assert "_modified_" in upload_key + assert upload_key.endswith(".jp2") + + # Verify 
that print was called correctly + expected_print_calls = [ + unittest.mock.call(f"Processing file: file1.jp2 from bucket {bucket_name}"), + unittest.mock.call(f"Processing file: file2.jp2 from bucket {bucket_name}"), + ] + mock_print.assert_has_calls(expected_print_calls, any_order=True)