From ea1d09ff7d1e976758ddb85a16502f11b80302ef Mon Sep 17 00:00:00 2001 From: kim pham Date: Wed, 18 Dec 2024 09:38:30 +0100 Subject: [PATCH 01/11] added s3 output bucket and folder arguments --- src/jp2_remediator/main.py | 13 ++++++++++--- src/jp2_remediator/processor.py | 31 ++++++++++++++----------------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/jp2_remediator/main.py b/src/jp2_remediator/main.py index 2ccf693..d641dce 100644 --- a/src/jp2_remediator/main.py +++ b/src/jp2_remediator/main.py @@ -44,11 +44,18 @@ def main(): "bucket", help="Name of the AWS S3 bucket to process JP2 files from" ) bucket_parser.add_argument( - "--prefix", help="Prefix of files in the AWS S3 bucket (optional)", - default="" + "--prefix", help="Prefix of files in the AWS S3 bucket (optional)", default="" + ) + bucket_parser.add_argument( + "--output-bucket", help="Name of the AWS S3 bucket to upload modified files (optional)" + ) + bucket_parser.add_argument( + "--output-prefix", help="Prefix for uploaded files in the output bucket (optional)", default="" ) bucket_parser.set_defaults( - func=lambda args: processor.process_s3_bucket(args.bucket, args.prefix) + func=lambda args: processor.process_s3_bucket( + args.bucket, args.prefix, args.output_bucket, args.output_prefix + ) ) args = parser.parse_args() diff --git a/src/jp2_remediator/processor.py b/src/jp2_remediator/processor.py index 7727ad8..10e8aa8 100644 --- a/src/jp2_remediator/processor.py +++ b/src/jp2_remediator/processor.py @@ -16,17 +16,7 @@ def process_file(self, file_path): reader = self.box_reader_factory.get_reader(file_path) reader.read_jp2_file() - def process_directory(self, directory_path): - """Process all JP2 files in a given directory.""" - for root, _, files in os.walk(directory_path): - for file in files: - if file.lower().endswith(".jp2"): - file_path = os.path.join(root, file) - print(f"Processing file: {file_path}") - reader = self.box_reader_factory.get_reader(file_path) - reader.read_jp2_file() - - def process_s3_bucket(self, bucket_name, prefix=""): + def process_s3_bucket(self, bucket_name, prefix="", output_bucket_name=None, output_prefix=""): """Process all JP2 files in a given S3 bucket.""" s3 = boto3.client("s3") response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix) @@ -46,10 +36,17 @@ def process_s3_bucket(self, bucket_name, prefix=""): timestamp = datetime.datetime.now().strftime( "%Y%m%d" ) # use "%Y%m%d_%H%M%S" for more precision - s3.upload_file( - download_path.replace( - ".jp2", f"_modified_{timestamp}.jp2" - ), - bucket_name, - file_path.replace(".jp2", f"_modified_{timestamp}.jp2") + modified_file_path = download_path.replace( + ".jp2", f"_modified_{timestamp}.jp2" ) + if os.path.exists(modified_file_path): + target_bucket = output_bucket_name if output_bucket_name else bucket_name + target_key = os.path.join(output_prefix, os.path.basename(modified_file_path)) + # Creates new s3 folder if output_prefix doesn't exist + s3.upload_file( + modified_file_path, + target_bucket, + target_key + ) + else: + print(f"File {modified_file_path} does not exist, skipping upload.") From 9f4cb0de6bf2d922d802548774d1c765de4b3b32 Mon Sep 17 00:00:00 2001 From: kim pham Date: Wed, 18 Dec 2024 10:10:24 +0100 Subject: [PATCH 02/11] added new test for output bucket and dir --- src/jp2_remediator/processor.py | 11 ++++ .../tests/unit/test_processor.py | 62 +++++++++++-------- 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/src/jp2_remediator/processor.py b/src/jp2_remediator/processor.py index 10e8aa8..95b9040 100644 --- a/src/jp2_remediator/processor.py +++ b/src/jp2_remediator/processor.py @@ -16,6 +16,17 @@ def process_file(self, file_path): reader = self.box_reader_factory.get_reader(file_path) reader.read_jp2_file() + def process_directory(self, directory_path): + """Process all JP2 files in a given directory.""" + for root, _, files in os.walk(directory_path): + for file in files: + if file.lower().endswith(".jp2"): + file_path = os.path.join(root, file) + print(f"Processing file: {file_path}") + reader = self.box_reader_factory.get_reader(file_path) + reader.read_jp2_file() + + def process_s3_bucket(self, bucket_name, prefix="", output_bucket_name=None, output_prefix=""): """Process all JP2 files in a given S3 bucket.""" s3 = boto3.client("s3") diff --git a/src/jp2_remediator/tests/unit/test_processor.py b/src/jp2_remediator/tests/unit/test_processor.py index 1122ae8..3190761 100644 --- a/src/jp2_remediator/tests/unit/test_processor.py +++ b/src/jp2_remediator/tests/unit/test_processor.py @@ -53,24 +53,29 @@ def test_process_directory_with_multiple_files( ] assert mock_box_reader_factory.get_reader.return_value.read_jp2_file.call_count == 2 - # Test for process_s3_bucket function - @patch("jp2_remediator.processor.boto3.client") - @patch("builtins.print") - def test_process_s3_bucket(self, mock_print, mock_boto3_client, processor, mock_box_reader_factory): + # Test for process_s3_bucket function with output_bucket and output_prefix + @patch("jp2_remediator.processor.boto3.client", autospec=True) + @patch("builtins.print", autospec=True) + def test_process_s3_bucket_with_output_options( + self, mock_print, mock_boto3_client, processor, mock_box_reader_factory + ): + # Ensure processor is the actual Processor object + assert hasattr(processor, "process_s3_bucket"), "Processor object expected" # Set up the mock S3 client mock_s3_client = MagicMock() mock_boto3_client.return_value = mock_s3_client - # Define the bucket name and prefix + # Define the bucket name, prefix, output bucket, and output prefix bucket_name = "test-bucket" prefix = "test-prefix" + output_bucket = "output-bucket" + output_prefix = "output-prefix/" # Prepare a fake response for list_objects_v2 mock_s3_client.list_objects_v2.return_value = { "Contents": [ {"Key": "file1.jp2"}, {"Key": "file2.jp2"}, - {"Key": "file3.txt"}, # Non-JP2 file to test filtering ] } @@ -79,7 +84,7 @@ def test_process_s3_bucket(self, mock_print, mock_boto3_client, processor, mock_ mock_s3_client.upload_file.return_value = None # Call the method under test - processor.process_s3_bucket(bucket_name, prefix) + processor.process_s3_bucket(bucket_name, prefix, output_bucket, output_prefix) # Verify that list_objects_v2 was called with the correct parameters mock_s3_client.list_objects_v2.assert_called_once_with(Bucket=bucket_name, Prefix=prefix) @@ -98,28 +103,31 @@ def test_process_s3_bucket(self, mock_print, mock_boto3_client, processor, mock_ ] assert mock_box_reader_factory.get_reader.call_args_list == expected_boxreader_calls - # Verify that read_jp2_file was called for each .jp2 file - assert mock_box_reader_factory.get_reader.return_value.read_jp2_file.call_count == 2 + # Verify that upload_file was called for each .jp2 file with output_bucket and output_prefix + expected_upload_calls = [ + unittest.mock.call( + "/tmp/file1_modified_.jp2", + output_bucket, + f"{output_prefix}file1_modified_.jp2" + ), + unittest.mock.call( + "/tmp/file2_modified_.jp2", + output_bucket, + f"{output_prefix}file2_modified_.jp2" + ), + ] + + for actual_call, expected_call in zip(mock_s3_client.upload_file.call_args_list, expected_upload_calls): + actual_args, _ = actual_call + expected_args, _ = expected_call + # Verify bucket, key, and file path + assert actual_args[0].startswith("/tmp/") # Local file path + assert actual_args[1] == expected_args[1] # Output bucket + assert actual_args[2].startswith(expected_args[2][:len(output_prefix)]) # Output prefix - # Verify that upload_file was called for each .jp2 file - upload_calls = mock_s3_client.upload_file.call_args_list - assert len(upload_calls) == 2 - for c in upload_calls: - args, _ = c - local_file_path = args[0] - upload_bucket = args[1] - upload_key = args[2] - # Check that the local file path includes '_modified_' and ends with '.jp2' - assert "_modified_" in local_file_path, "'_modified_' should be in local_file_path" - assert local_file_path.endswith(".jp2") - # Check that the upload is to the correct bucket and key - assert upload_bucket == bucket_name - assert "_modified_" in upload_key - assert upload_key.endswith(".jp2") - - # Verify that print was called correctly + # Verify print calls for processing files expected_print_calls = [ unittest.mock.call(f"Processing file: file1.jp2 from bucket {bucket_name}"), unittest.mock.call(f"Processing file: file2.jp2 from bucket {bucket_name}"), ] - mock_print.assert_has_calls(expected_print_calls, any_order=True) + mock_print.assert_has_calls(expected_print_calls, any_order=True) \ No newline at end of file From d59a03bc1960df95db685799ada51081ac8cf7e3 Mon Sep 17 00:00:00 2001 From: kim pham Date: Wed, 18 Dec 2024 10:20:16 +0100 Subject: [PATCH 03/11] clean up some flake8 linting flags --- src/jp2_remediator/processor.py | 1 - src/jp2_remediator/tests/unit/test_processor.py | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/jp2_remediator/processor.py b/src/jp2_remediator/processor.py index 95b9040..7690838 100644 --- a/src/jp2_remediator/processor.py +++ b/src/jp2_remediator/processor.py @@ -26,7 +26,6 @@ def process_directory(self, directory_path): reader = self.box_reader_factory.get_reader(file_path) reader.read_jp2_file() - def process_s3_bucket(self, bucket_name, prefix="", output_bucket_name=None, output_prefix=""): """Process all JP2 files in a given S3 bucket.""" s3 = boto3.client("s3") diff --git a/src/jp2_remediator/tests/unit/test_processor.py b/src/jp2_remediator/tests/unit/test_processor.py index 3190761..34d890f 100644 --- a/src/jp2_remediator/tests/unit/test_processor.py +++ b/src/jp2_remediator/tests/unit/test_processor.py @@ -57,8 +57,7 @@ def test_process_directory_with_multiple_files( @patch("jp2_remediator.processor.boto3.client", autospec=True) @patch("builtins.print", autospec=True) def test_process_s3_bucket_with_output_options( - self, mock_print, mock_boto3_client, processor, mock_box_reader_factory - ): + self, mock_print, mock_boto3_client, processor, mock_box_reader_factory): # Ensure processor is the actual Processor object assert hasattr(processor, "process_s3_bucket"), "Processor object expected" # Set up the mock S3 client @@ -130,4 +129,4 @@ def test_process_s3_bucket_with_output_options( unittest.mock.call(f"Processing file: file1.jp2 from bucket {bucket_name}"), unittest.mock.call(f"Processing file: file2.jp2 from bucket {bucket_name}"), ] - mock_print.assert_has_calls(expected_print_calls, any_order=True) \ No newline at end of file + mock_print.assert_has_calls(expected_print_calls, any_order=True) From a8baf0f38531c62d70e80d8e6ac18ead7211f987 Mon Sep 17 00:00:00 2001 From: kim pham Date: Wed, 18 Dec 2024 10:33:55 +0100 Subject: [PATCH 04/11] clean up test --- src/jp2_remediator/tests/unit/test_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/jp2_remediator/tests/unit/test_processor.py b/src/jp2_remediator/tests/unit/test_processor.py index 34d890f..092a5db 100644 --- a/src/jp2_remediator/tests/unit/test_processor.py +++ b/src/jp2_remediator/tests/unit/test_processor.py @@ -2,6 +2,7 @@ import pytest from unittest.mock import call, patch, MagicMock from jp2_remediator.processor import Processor +import os class TestProcessor: From 52b29b891f908cd6a499f317c4ca75bc8206b9f8 Mon Sep 17 00:00:00 2001 From: kim pham Date: Wed, 18 Dec 2024 10:35:14 +0100 Subject: [PATCH 05/11] clean up test --- src/jp2_remediator/tests/unit/test_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/jp2_remediator/tests/unit/test_processor.py b/src/jp2_remediator/tests/unit/test_processor.py index 092a5db..34d890f 100644 --- a/src/jp2_remediator/tests/unit/test_processor.py +++ b/src/jp2_remediator/tests/unit/test_processor.py @@ -2,7 +2,6 @@ import pytest from unittest.mock import call, patch, MagicMock from jp2_remediator.processor import Processor -import os class TestProcessor: From b49265c73921c3f656c147d773e55b34c840038b Mon Sep 17 00:00:00 2001 From: kim pham Date: Wed, 18 Dec 2024 10:45:21 +0100 Subject: [PATCH 06/11] coverage test again --- .../tests/unit/test_processor.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/jp2_remediator/tests/unit/test_processor.py b/src/jp2_remediator/tests/unit/test_processor.py index 34d890f..5b3da72 100644 --- a/src/jp2_remediator/tests/unit/test_processor.py +++ b/src/jp2_remediator/tests/unit/test_processor.py @@ -2,6 +2,7 @@ import pytest from unittest.mock import call, patch, MagicMock from jp2_remediator.processor import Processor +import os class TestProcessor: @@ -130,3 +131,46 @@ def test_process_s3_bucket_with_output_options( unittest.mock.call(f"Processing file: file2.jp2 from bucket {bucket_name}"), ] mock_print.assert_has_calls(expected_print_calls, any_order=True) + + @patch("jp2_remediator.processor.os.path.exists", return_value=True) + @patch("jp2_remediator.processor.boto3.client", autospec=True) + @patch("builtins.print", autospec=True) + def test_upload_modified_file( + self, mock_print, mock_boto3_client, mock_os_path_exists, processor, mock_box_reader_factory + ): + # Set up the mock S3 client + mock_s3_client = MagicMock() + mock_boto3_client.return_value = mock_s3_client + + # Define test parameters + bucket_name = "test-bucket" + prefix = "test-prefix" + output_bucket_name = "output-bucket" + output_prefix = "output-prefix/" + modified_file_path = "/tmp/test_file_modified_20241218.jp2" + + # Prepare a fake response for list_objects_v2 + mock_s3_client.list_objects_v2.return_value = { + "Contents": [ + {"Key": "test_file.jp2"}, + ] + } + + # Mock download_file to simulate downloading the file + mock_s3_client.download_file.return_value = None + + # Call the method under test + processor.process_s3_bucket(bucket_name, prefix, output_bucket_name, output_prefix) + + # Verify that os.path.exists was called + mock_os_path_exists.assert_called_with(modified_file_path) + + # Verify that upload_file was called with the correct arguments + mock_s3_client.upload_file.assert_called_once_with( + modified_file_path, + output_bucket_name, + os.path.join(output_prefix, os.path.basename(modified_file_path)) + ) + + # Verify print statements for processing and upload + mock_print.assert_any_call(f"Processing file: test_file.jp2 from bucket {bucket_name}") From 3f3e2b3fd57b763302b0df0c3f9ae2119b3fcdab Mon Sep 17 00:00:00 2001 From: kim pham Date: Thu, 19 Dec 2024 19:45:59 +0100 Subject: [PATCH 07/11] new s3 an s3-file arguments --- bin/docker-run.sh | 14 +- src/jp2_remediator/main.py | 45 +++- src/jp2_remediator/processor.py | 67 +++--- .../tests/unit/test_processor.py | 199 +++++++----------- 4 files changed, 169 insertions(+), 156 deletions(-) diff --git a/bin/docker-run.sh b/bin/docker-run.sh index a251b00..deacbff 100755 --- a/bin/docker-run.sh +++ b/bin/docker-run.sh @@ -4,4 +4,16 @@ #docker run --rm --mount type=bind,source=${PWD},target=/app -it --entrypoint /bin/bash artifactory.huit.harvard.edu/lts/jp2_remediator $@ ## Executable Docker image -docker run --rm --mount type=bind,source=${PWD},target=/data -it artifactory.huit.harvard.edu/lts/jp2_remediator $@ +# docker run --rm --mount type=bind,source=${PWD},target=/app -it artifactory.huit.harvard.edu/lts/jp2_remediator $@ + +# Create an .env file in the same directory with AWS credentials +set -a +source $(dirname "$0")/.env +set +a + +docker run --rm \ + --mount type=bind,source=${PWD},target=/app \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e AWS_SESSION_TOKEN \ + -it artifactory.huit.harvard.edu/lts/jp2_remediator "$@" \ No newline at end of file diff --git a/src/jp2_remediator/main.py b/src/jp2_remediator/main.py index d641dce..7833b09 100644 --- a/src/jp2_remediator/main.py +++ b/src/jp2_remediator/main.py @@ -37,24 +37,49 @@ def main(): ) # Subparser for processing all JP2 files in an S3 bucket - bucket_parser = subparsers.add_parser( - "bucket", help="Process all JP2 files in an S3 bucket" + s3_bucket_parser = subparsers.add_parser( + "s3", help="Process JP2 files in an S3 bucket" ) - bucket_parser.add_argument( - "bucket", help="Name of the AWS S3 bucket to process JP2 files from" + s3_bucket_parser.add_argument( + "input_bucket", help="Name of the AWS S3 bucket to process JP2 files from" ) - bucket_parser.add_argument( - "--prefix", help="Prefix of files in the AWS S3 bucket (optional)", default="" + s3_bucket_parser.add_argument( + "--input-prefix", help="Prefix of files in the AWS S3 bucket (optional)", default="" ) - bucket_parser.add_argument( + s3_bucket_parser.add_argument( "--output-bucket", help="Name of the AWS S3 bucket to upload modified files (optional)" ) - bucket_parser.add_argument( + s3_bucket_parser.add_argument( "--output-prefix", help="Prefix for uploaded files in the output bucket (optional)", default="" ) - bucket_parser.set_defaults( + s3_bucket_parser.set_defaults( func=lambda args: processor.process_s3_bucket( - args.bucket, args.prefix, args.output_bucket, args.output_prefix + args.input_bucket, args.input_prefix, args.output_bucket, args.output_prefix + ) + ) + + # Subparser for processing a single JP2 file in S3 + s3_file_parser = subparsers.add_parser( + "s3-file", help="Process a single JP2 file in S3" + ) + s3_file_parser.add_argument( + "input_bucket", help="Name of the AWS S3 bucket containing the JP2 file" + ) + s3_file_parser.add_argument( + "--input-key", help="Key (path) of the JP2 file in the S3 bucket", required=True + ) + s3_file_parser.add_argument( + "--output-bucket", help="Name of the AWS S3 bucket to upload the modified file (optional)" + ) + s3_file_parser.add_argument( + "--output-prefix", help="Prefix for the uploaded file in the output bucket (optional)", default="" + ) + s3_file_parser.add_argument( + "--output-key", help="Full key (path) for the uploaded file (overrides output-prefix)" + ) + s3_file_parser.set_defaults( + func=lambda args: processor.process_s3_file( + args.input_bucket, args.input_key, args.output_bucket, args.output_prefix, args.output_key ) ) diff --git a/src/jp2_remediator/processor.py b/src/jp2_remediator/processor.py index 7690838..7ba519b 100644 --- a/src/jp2_remediator/processor.py +++ b/src/jp2_remediator/processor.py @@ -26,37 +26,50 @@ def process_directory(self, directory_path): reader = self.box_reader_factory.get_reader(file_path) reader.read_jp2_file() - def process_s3_bucket(self, bucket_name, prefix="", output_bucket_name=None, output_prefix=""): + def process_s3_file(self, input_bucket, input_key, output_bucket, output_prefix=None, output_key=None): + """Process a specific JP2 file from S3 and upload to a specified S3 location.""" + s3 = boto3.client("s3") + + # Generate the output key dynamically if not explicitly provided + if not output_key: + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + output_key = os.path.join( + output_prefix, + os.path.basename(input_key).replace(".jp2", f"_modified_file_{timestamp}.jp2") + ) + + # Download the file from S3 + download_path = f"/tmp/{os.path.basename(input_key)}" + print(f"Downloading file: {input_key} from bucket: {input_bucket}") + s3.download_file(input_bucket, input_key, download_path) + + # Process the file + reader = self.box_reader_factory.get_reader(download_path) + reader.read_jp2_file() + + # Generate the modified file path + timestamp = datetime.datetime.now().strftime("%Y%m%d") + modified_file_path = download_path.replace(".jp2", f"_modified_{timestamp}.jp2") + + if os.path.exists(modified_file_path): + print(f"Uploading modified file to bucket: {output_bucket}, key: {output_key}") + s3.upload_file(modified_file_path, output_bucket, output_key) + else: + print(f"File {modified_file_path} does not exist, skipping upload.") + + def process_s3_bucket(self, input_bucket, input_prefix, output_bucket, output_prefix): """Process all JP2 files in a given S3 bucket.""" s3 = boto3.client("s3") - response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix) + response = s3.list_objects_v2(Bucket=input_bucket, Prefix=input_prefix) if "Contents" in response: for obj in response["Contents"]: if obj["Key"].lower().endswith(".jp2"): - file_path = obj["Key"] - print(f"""Processing file: {file_path} from bucket { - bucket_name - }""") - download_path = f"/tmp/{os.path.basename(file_path)}" - s3.download_file(bucket_name, file_path, download_path) - reader = self.box_reader_factory.get_reader(download_path) - reader.read_jp2_file() - # Optionally, upload modified file back to S3 - timestamp = datetime.datetime.now().strftime( - "%Y%m%d" - ) # use "%Y%m%d_%H%M%S" for more precision - modified_file_path = download_path.replace( - ".jp2", f"_modified_{timestamp}.jp2" + input_key = obj["Key"] + timestamp = datetime.datetime.now().strftime("%Y%m%d") + output_key = os.path.join( + output_prefix, + os.path.basename(input_key).replace(".jp2", f"_modified_{timestamp}.jp2") ) - if os.path.exists(modified_file_path): - target_bucket = output_bucket_name if output_bucket_name else bucket_name - target_key = os.path.join(output_prefix, os.path.basename(modified_file_path)) - # Creates new s3 folder if output_prefix doesn't exist - s3.upload_file( - modified_file_path, - target_bucket, - target_key - ) - else: - print(f"File {modified_file_path} does not exist, skipping upload.") + print(f"Processing file: {input_key} from bucket: {input_bucket}") + self.process_s3_file(input_bucket, input_key, output_bucket, output_key=output_key) diff --git a/src/jp2_remediator/tests/unit/test_processor.py b/src/jp2_remediator/tests/unit/test_processor.py index 5b3da72..1ee2672 100644 --- a/src/jp2_remediator/tests/unit/test_processor.py +++ b/src/jp2_remediator/tests/unit/test_processor.py @@ -1,8 +1,6 @@ -import unittest import pytest -from unittest.mock import call, patch, MagicMock +from unittest.mock import patch, MagicMock from jp2_remediator.processor import Processor -import os class TestProcessor: @@ -15,162 +13,127 @@ def mock_box_reader_factory(self): def processor(self, mock_box_reader_factory): return Processor(mock_box_reader_factory) - # Test for process_file function @patch("builtins.print") def test_process_file(self, mock_print, processor, mock_box_reader_factory): - # Define the file path file_path = "test_file.jp2" - - # Call process_file with the test file path processor.process_file(file_path) - - # Check that the file was processed mock_print.assert_called_once_with(f"Processing file: {file_path}") - - # Ensure the BoxReader instance had its read_jp2_file method called mock_box_reader_factory.get_reader.assert_called_once_with(file_path) mock_box_reader_factory.get_reader.return_value.read_jp2_file.assert_called_once() - # Test for process_directory function @patch("os.walk", return_value=[("root", [], ["file1.jp2", "file2.jp2"])]) @patch("builtins.print") - def test_process_directory_with_multiple_files( - self, mock_print, mock_os_walk, processor, mock_box_reader_factory - ): - # Call process_directory with a dummy path + def test_process_directory_with_multiple_files(self, mock_print, mock_os_walk, processor, mock_box_reader_factory): processor.process_directory("dummy_path") - - # Check that each JP2 file in the directory was processed mock_print.assert_any_call("Processing file: root/file1.jp2") mock_print.assert_any_call("Processing file: root/file2.jp2") - - # Ensure each BoxReader instance had its read_jp2_file method called assert mock_box_reader_factory.get_reader.call_count == 2 - - # Ensure each BoxReader instance was created with the correct file path - assert mock_box_reader_factory.get_reader.call_args_list == [ - call("root/file1.jp2"), - call("root/file2.jp2"), - ] assert mock_box_reader_factory.get_reader.return_value.read_jp2_file.call_count == 2 - # Test for process_s3_bucket function with output_bucket and output_prefix + @patch("jp2_remediator.processor.os.path.exists", return_value=True) @patch("jp2_remediator.processor.boto3.client", autospec=True) @patch("builtins.print", autospec=True) - def test_process_s3_bucket_with_output_options( - self, mock_print, mock_boto3_client, processor, mock_box_reader_factory): - # Ensure processor is the actual Processor object - assert hasattr(processor, "process_s3_bucket"), "Processor object expected" - # Set up the mock S3 client + def test_process_s3_file_without_output_key_or_prefix( + self, mock_print, mock_boto3_client, mock_os_path_exists, processor, mock_box_reader_factory + ): + # No explicit output_key or prefix given, output_prefix is empty string mock_s3_client = MagicMock() mock_boto3_client.return_value = mock_s3_client + input_bucket = "test-bucket" + input_key = "some_folder/my_image.jp2" + mock_s3_client.download_file.return_value = None + mock_s3_client.upload_file.return_value = None - # Define the bucket name, prefix, output bucket, and output prefix - bucket_name = "test-bucket" - prefix = "test-prefix" - output_bucket = "output-bucket" - output_prefix = "output-prefix/" + processor.process_s3_file( + input_bucket=input_bucket, + input_key=input_key, + output_bucket=input_bucket, + output_prefix="" # Ensures dynamic output_key creation without None issues + ) + + mock_s3_client.download_file.assert_called_once() + mock_box_reader_factory.get_reader.assert_called_once() + mock_box_reader_factory.get_reader.return_value.read_jp2_file.assert_called_once() + assert any("Uploading modified file to bucket:" in c.args[0] for c in mock_print.call_args_list) - # Prepare a fake response for list_objects_v2 + @patch("jp2_remediator.processor.os.path.exists", return_value=True) + @patch("jp2_remediator.processor.boto3.client", autospec=True) + @patch("builtins.print", autospec=True) + def test_process_s3_bucket_with_jp2_files( + self, mock_print, mock_boto3_client, mock_os_path_exists, processor, mock_box_reader_factory + ): + mock_s3_client = MagicMock() + mock_boto3_client.return_value = mock_s3_client + input_bucket = "test-input-bucket" + input_prefix = "some-prefix/" + output_bucket = "test-output-bucket" + output_prefix = "processed/" mock_s3_client.list_objects_v2.return_value = { "Contents": [ - {"Key": "file1.jp2"}, - {"Key": "file2.jp2"}, + {"Key": "some-prefix/image1.jp2"}, + {"Key": "some-prefix/image2.jp2"} ] } - - # Mock download_file and upload_file methods mock_s3_client.download_file.return_value = None mock_s3_client.upload_file.return_value = None - # Call the method under test - processor.process_s3_bucket(bucket_name, prefix, output_bucket, output_prefix) - - # Verify that list_objects_v2 was called with the correct parameters - mock_s3_client.list_objects_v2.assert_called_once_with(Bucket=bucket_name, Prefix=prefix) - - # Verify that download_file was called for each .jp2 file - expected_download_calls = [ - unittest.mock.call(bucket_name, "file1.jp2", "/tmp/file1.jp2"), - unittest.mock.call(bucket_name, "file2.jp2", "/tmp/file2.jp2"), - ] - assert mock_s3_client.download_file.call_args_list == expected_download_calls - - # Verify that BoxReader was instantiated with the correct download paths - expected_boxreader_calls = [ - unittest.mock.call("/tmp/file1.jp2"), - unittest.mock.call("/tmp/file2.jp2"), - ] - assert mock_box_reader_factory.get_reader.call_args_list == expected_boxreader_calls - - # Verify that upload_file was called for each .jp2 file with output_bucket and output_prefix - expected_upload_calls = [ - unittest.mock.call( - "/tmp/file1_modified_.jp2", - output_bucket, - f"{output_prefix}file1_modified_.jp2" - ), - unittest.mock.call( - "/tmp/file2_modified_.jp2", - output_bucket, - f"{output_prefix}file2_modified_.jp2" - ), - ] - - for actual_call, expected_call in zip(mock_s3_client.upload_file.call_args_list, expected_upload_calls): - actual_args, _ = actual_call - expected_args, _ = expected_call - # Verify bucket, key, and file path - assert actual_args[0].startswith("/tmp/") # Local file path - assert actual_args[1] == expected_args[1] # Output bucket - assert actual_args[2].startswith(expected_args[2][:len(output_prefix)]) # Output prefix - - # Verify print calls for processing files - expected_print_calls = [ - unittest.mock.call(f"Processing file: file1.jp2 from bucket {bucket_name}"), - unittest.mock.call(f"Processing file: file2.jp2 from bucket {bucket_name}"), - ] - mock_print.assert_has_calls(expected_print_calls, any_order=True) + processor.process_s3_bucket(input_bucket, input_prefix, output_bucket, output_prefix) + + mock_s3_client.list_objects_v2.assert_called_once() + mock_print.assert_any_call(f"Processing file: some-prefix/image1.jp2 from bucket: {input_bucket}") + mock_print.assert_any_call(f"Processing file: some-prefix/image2.jp2 from bucket: {input_bucket}") + assert mock_box_reader_factory.get_reader.call_count == 2 + assert mock_box_reader_factory.get_reader.return_value.read_jp2_file.call_count == 2 + + @patch("jp2_remediator.processor.boto3.client", autospec=True) + @patch("builtins.print", autospec=True) + def test_process_s3_bucket_empty_response(self, mock_print, mock_boto3_client, processor): + mock_s3_client = MagicMock() + mock_boto3_client.return_value = mock_s3_client + mock_s3_client.list_objects_v2.return_value = {} + processor.process_s3_bucket("test-bucket", "test-prefix/", "output-bucket", "output-prefix/") + mock_print.assert_not_called() + mock_s3_client.upload_file.assert_not_called() @patch("jp2_remediator.processor.os.path.exists", return_value=True) @patch("jp2_remediator.processor.boto3.client", autospec=True) @patch("builtins.print", autospec=True) - def test_upload_modified_file( - self, mock_print, mock_boto3_client, mock_os_path_exists, processor, mock_box_reader_factory + def test_process_s3_bucket_skip_non_jp2_files( + self, mock_print, mock_boto3_client, mock_os_path_exists, processor ): - # Set up the mock S3 client mock_s3_client = MagicMock() mock_boto3_client.return_value = mock_s3_client - - # Define test parameters - bucket_name = "test-bucket" - prefix = "test-prefix" - output_bucket_name = "output-bucket" - output_prefix = "output-prefix/" - modified_file_path = "/tmp/test_file_modified_20241218.jp2" - - # Prepare a fake response for list_objects_v2 mock_s3_client.list_objects_v2.return_value = { "Contents": [ - {"Key": "test_file.jp2"}, + {"Key": "test-prefix/file1.jp2"}, + {"Key": "test-prefix/file2.txt"}, + {"Key": "test-prefix/file3.jpg"}, ] } + processor.process_s3_bucket("test-bucket", "test-prefix/", "output-bucket", "output-prefix/") + mock_print.assert_any_call("Processing file: test-prefix/file1.jp2 from bucket: test-bucket") + mock_s3_client.upload_file.assert_called_once() - # Mock download_file to simulate downloading the file + @patch("jp2_remediator.processor.os.path.exists", side_effect=[True, False]) + @patch("jp2_remediator.processor.boto3.client", autospec=True) + @patch("builtins.print", autospec=True) + def test_process_s3_file_upload_logic( + self, mock_print, mock_boto3_client, mock_os_path_exists, processor, mock_box_reader_factory + ): + mock_s3_client = MagicMock() + mock_boto3_client.return_value = mock_s3_client + input_bucket = "test-bucket" + input_key = "test-folder/file.jp2" + output_bucket = "output-bucket" + output_key = "output-folder/file_modified.jp2" mock_s3_client.download_file.return_value = None + mock_s3_client.upload_file.return_value = None - # Call the method under test - processor.process_s3_bucket(bucket_name, prefix, output_bucket_name, output_prefix) - - # Verify that os.path.exists was called - mock_os_path_exists.assert_called_with(modified_file_path) - - # Verify that upload_file was called with the correct arguments - mock_s3_client.upload_file.assert_called_once_with( - modified_file_path, - output_bucket_name, - os.path.join(output_prefix, os.path.basename(modified_file_path)) - ) + # First call (file exists) + processor.process_s3_file(input_bucket, input_key, output_bucket, output_key=output_key) + # Second call (file does not exist after modification) + processor.process_s3_file(input_bucket, input_key, output_bucket, output_key=output_key) - # Verify print statements for processing and upload - mock_print.assert_any_call(f"Processing file: test_file.jp2 from bucket {bucket_name}") + mock_print.assert_any_call(f"Uploading modified file to bucket: {output_bucket}, key: {output_key}") + all_prints = [c.args[0] for c in mock_print.call_args_list] + assert any("does not exist, skipping upload." in p for p in all_prints) From fd06b0ca8c058c210545ba4159ca5ecad54e8872 Mon Sep 17 00:00:00 2001 From: kim pham Date: Thu, 19 Dec 2024 19:49:21 +0100 Subject: [PATCH 08/11] add docker file --- bin/docker-run.sh | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/bin/docker-run.sh b/bin/docker-run.sh index deacbff..772e751 100755 --- a/bin/docker-run.sh +++ b/bin/docker-run.sh @@ -4,16 +4,16 @@ #docker run --rm --mount type=bind,source=${PWD},target=/app -it --entrypoint /bin/bash artifactory.huit.harvard.edu/lts/jp2_remediator $@ ## Executable Docker image -# docker run --rm --mount type=bind,source=${PWD},target=/app -it artifactory.huit.harvard.edu/lts/jp2_remediator $@ +docker run --rm --mount type=bind,source=${PWD},target=/data -it artifactory.huit.harvard.edu/lts/jp2_remediator $@ -# Create an .env file in the same directory with AWS credentials -set -a -source $(dirname "$0")/.env -set +a +## Passing AWS credentials: If you want to use the s3 bucket option, create an .env file in the same directory with AWS credentials +# set -a +# source $(dirname "$0")/.env +# set +a -docker run --rm \ - --mount type=bind,source=${PWD},target=/app \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e AWS_SESSION_TOKEN \ - -it artifactory.huit.harvard.edu/lts/jp2_remediator "$@" \ No newline at end of file +# docker run --rm \ +# --mount type=bind,source=${PWD},target=/data \ +# -e AWS_ACCESS_KEY_ID \ +# -e AWS_SECRET_ACCESS_KEY \ +# -e AWS_SESSION_TOKEN \ +# -it artifactory.huit.harvard.edu/lts/jp2_remediator "$@" \ No newline at end of file From 7f3dadfe5a641df2d467c85257ce1895b6eba8e5 Mon Sep 17 00:00:00 2001 From: kim pham Date: Thu, 19 Dec 2024 22:17:10 +0100 Subject: [PATCH 09/11] remove hhmmss to timestamp of s3-file --- src/jp2_remediator/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jp2_remediator/processor.py b/src/jp2_remediator/processor.py index 7ba519b..78a51c1 100644 --- a/src/jp2_remediator/processor.py +++ b/src/jp2_remediator/processor.py @@ -32,7 +32,7 @@ def process_s3_file(self, input_bucket, input_key, output_bucket, output_prefix= # Generate the output key dynamically if not explicitly provided if not output_key: - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + timestamp = datetime.datetime.now().strftime("%Y%m%d") output_key = os.path.join( output_prefix, os.path.basename(input_key).replace(".jp2", f"_modified_file_{timestamp}.jp2") From 43631b12131875baa0cb82dd3caa5b062929dad2 Mon Sep 17 00:00:00 2001 From: kim pham Date: Tue, 24 Dec 2024 10:32:41 +0100 Subject: [PATCH 10/11] remove s3bucket option, change s3 arg requirements, update tests, print to logger statements --- src/jp2_remediator/main.py | 31 +--- src/jp2_remediator/processor.py | 32 +--- .../tests/unit/test_processor.py | 153 +++++++----------- 3 files changed, 72 insertions(+), 144 deletions(-) diff --git a/src/jp2_remediator/main.py b/src/jp2_remediator/main.py index 7833b09..9780490 100644 --- a/src/jp2_remediator/main.py +++ b/src/jp2_remediator/main.py @@ -36,50 +36,25 @@ def main(): func=lambda args: processor.process_directory(args.directory) ) - # Subparser for processing all JP2 files in an S3 bucket - s3_bucket_parser = subparsers.add_parser( - "s3", help="Process JP2 files in an S3 bucket" - ) - s3_bucket_parser.add_argument( - "input_bucket", help="Name of the AWS S3 bucket to process JP2 files from" - ) - s3_bucket_parser.add_argument( - "--input-prefix", help="Prefix of files in the AWS S3 bucket (optional)", default="" - ) - s3_bucket_parser.add_argument( - "--output-bucket", help="Name of the AWS S3 bucket to upload modified files (optional)" - ) - s3_bucket_parser.add_argument( - "--output-prefix", help="Prefix for uploaded files in the output bucket (optional)", default="" - ) - s3_bucket_parser.set_defaults( - func=lambda args: processor.process_s3_bucket( - args.input_bucket, args.input_prefix, args.output_bucket, args.output_prefix - ) - ) - # Subparser for processing a single JP2 file in S3 s3_file_parser = subparsers.add_parser( "s3-file", help="Process a single JP2 file in S3" ) s3_file_parser.add_argument( - "input_bucket", help="Name of the AWS S3 bucket containing the JP2 file" + "--input-bucket", help="Name of the AWS S3 bucket containing the JP2 file", required=True ) s3_file_parser.add_argument( "--input-key", help="Key (path) of the JP2 file in the S3 bucket", required=True ) s3_file_parser.add_argument( - "--output-bucket", help="Name of the AWS S3 bucket to upload the modified file (optional)" - ) - s3_file_parser.add_argument( - "--output-prefix", help="Prefix for the uploaded file in the output bucket (optional)", default="" + "--output-bucket", help="Name of the AWS S3 bucket to upload the modified file (optional)", required=True ) s3_file_parser.add_argument( "--output-key", help="Full key (path) for the uploaded file (overrides output-prefix)" ) s3_file_parser.set_defaults( func=lambda args: processor.process_s3_file( - args.input_bucket, args.input_key, args.output_bucket, args.output_prefix, args.output_key + args.input_bucket, args.input_key, args.output_bucket, args.output_key ) ) diff --git a/src/jp2_remediator/processor.py b/src/jp2_remediator/processor.py index 78a51c1..d64acd5 100644 --- a/src/jp2_remediator/processor.py +++ b/src/jp2_remediator/processor.py @@ -1,6 +1,7 @@ import datetime import os import boto3 +from jp2_remediator import configure_logger class Processor: @@ -9,10 +10,11 @@ class Processor: def __init__(self, factory): """Initialize the Processor with a BoxReader factory.""" self.box_reader_factory = factory + self.logger = configure_logger(__name__) def process_file(self, file_path): """Process a single JP2 file.""" - print(f"Processing file: {file_path}") + self.logger.info(f"Processing file: {file_path}") reader = self.box_reader_factory.get_reader(file_path) reader.read_jp2_file() @@ -22,11 +24,11 @@ def process_directory(self, directory_path): for file in files: if file.lower().endswith(".jp2"): file_path = os.path.join(root, file) - print(f"Processing file: {file_path}") + self.logger.info(f"Processing file: {file_path}") reader = self.box_reader_factory.get_reader(file_path) reader.read_jp2_file() - def process_s3_file(self, input_bucket, input_key, output_bucket, output_prefix=None, output_key=None): + def process_s3_file(self, input_bucket, input_key, output_bucket, output_key=None): """Process a specific JP2 file from S3 and upload to a specified S3 location.""" s3 = boto3.client("s3") @@ -34,13 +36,12 @@ def process_s3_file(self, input_bucket, input_key, output_bucket, output_prefix= if not output_key: timestamp = datetime.datetime.now().strftime("%Y%m%d") output_key = os.path.join( - output_prefix, os.path.basename(input_key).replace(".jp2", f"_modified_file_{timestamp}.jp2") ) # Download the file from S3 download_path = f"/tmp/{os.path.basename(input_key)}" - print(f"Downloading file: {input_key} from bucket: {input_bucket}") + self.logger.info(f"Downloading file: {input_key} from bucket: {input_bucket}") s3.download_file(input_bucket, input_key, download_path) # Process the file @@ -52,24 +53,7 @@ def process_s3_file(self, input_bucket, input_key, output_bucket, output_prefix= modified_file_path = download_path.replace(".jp2", f"_modified_{timestamp}.jp2") if os.path.exists(modified_file_path): - print(f"Uploading modified file to bucket: {output_bucket}, key: {output_key}") + self.logger.info(f"Uploading modified file to bucket: {output_bucket}, key: {output_key}") s3.upload_file(modified_file_path, output_bucket, output_key) else: - print(f"File {modified_file_path} does not exist, skipping upload.") - - def process_s3_bucket(self, input_bucket, input_prefix, output_bucket, output_prefix): - """Process all JP2 files in a given S3 bucket.""" - s3 = boto3.client("s3") - response = s3.list_objects_v2(Bucket=input_bucket, Prefix=input_prefix) - - if "Contents" in response: - for obj in response["Contents"]: - if obj["Key"].lower().endswith(".jp2"): - input_key = obj["Key"] - timestamp = datetime.datetime.now().strftime("%Y%m%d") - output_key = os.path.join( - output_prefix, - os.path.basename(input_key).replace(".jp2", f"_modified_{timestamp}.jp2") - ) - print(f"Processing file: {input_key} from bucket: {input_bucket}") - self.process_s3_file(input_bucket, input_key, output_bucket, output_key=output_key) + self.logger.info(f"File {modified_file_path} does not exist, skipping upload.") diff --git a/src/jp2_remediator/tests/unit/test_processor.py b/src/jp2_remediator/tests/unit/test_processor.py index 1ee2672..c666c3b 100644 --- a/src/jp2_remediator/tests/unit/test_processor.py +++ b/src/jp2_remediator/tests/unit/test_processor.py @@ -10,130 +10,99 @@ def mock_box_reader_factory(self): return MagicMock() @pytest.fixture - def processor(self, mock_box_reader_factory): + @patch("jp2_remediator.processor.configure_logger") + def processor(self, mock_configure_logger, mock_box_reader_factory): + mock_logger = MagicMock() + mock_configure_logger.return_value = mock_logger return Processor(mock_box_reader_factory) - @patch("builtins.print") - def test_process_file(self, mock_print, processor, mock_box_reader_factory): + def test_process_file(self, processor, mock_box_reader_factory): file_path = "test_file.jp2" + + # Run the processor method processor.process_file(file_path) - mock_print.assert_called_once_with(f"Processing file: {file_path}") + + # Test that logger.info was called once with the correct message + processor.logger.info.assert_called_once_with(f"Processing file: {file_path}") mock_box_reader_factory.get_reader.assert_called_once_with(file_path) mock_box_reader_factory.get_reader.return_value.read_jp2_file.assert_called_once() @patch("os.walk", return_value=[("root", [], ["file1.jp2", "file2.jp2"])]) - @patch("builtins.print") - def test_process_directory_with_multiple_files(self, mock_print, mock_os_walk, processor, mock_box_reader_factory): + def test_process_directory_with_multiple_files(self, mock_os_walk, processor, mock_box_reader_factory): processor.process_directory("dummy_path") - mock_print.assert_any_call("Processing file: root/file1.jp2") - mock_print.assert_any_call("Processing file: root/file2.jp2") + + # Test that logger.info was called for each file + processor.logger.info.assert_any_call("Processing file: root/file1.jp2") + processor.logger.info.assert_any_call("Processing file: root/file2.jp2") assert mock_box_reader_factory.get_reader.call_count == 2 assert mock_box_reader_factory.get_reader.return_value.read_jp2_file.call_count == 2 @patch("jp2_remediator.processor.os.path.exists", return_value=True) @patch("jp2_remediator.processor.boto3.client", autospec=True) - @patch("builtins.print", autospec=True) - def test_process_s3_file_without_output_key_or_prefix( - self, mock_print, mock_boto3_client, mock_os_path_exists, processor, mock_box_reader_factory + def test_process_s3_file_with_output_key( + self, mock_boto3_client, mock_os_path_exists, processor ): - # No explicit output_key or prefix given, output_prefix is empty string + """ + When the modified file DOES exist, we expect: + 1) The logger to show 'Downloading file:' and 'Uploading modified file:' + 2) The local file path to contain a wildcard segment (file_modified_). + 3) The upload_file call to use the correct output_bucket/output_key. + """ mock_s3_client = MagicMock() mock_boto3_client.return_value = mock_s3_client - input_bucket = "test-bucket" - input_key = "some_folder/my_image.jp2" - mock_s3_client.download_file.return_value = None - mock_s3_client.upload_file.return_value = None - - processor.process_s3_file( - input_bucket=input_bucket, - input_key=input_key, - output_bucket=input_bucket, - output_prefix="" # Ensures dynamic output_key creation without None issues - ) - mock_s3_client.download_file.assert_called_once() - mock_box_reader_factory.get_reader.assert_called_once() - mock_box_reader_factory.get_reader.return_value.read_jp2_file.assert_called_once() - assert any("Uploading modified file to bucket:" in c.args[0] for c in mock_print.call_args_list) + input_bucket = "test-bucket" + input_key = "test-folder/file.jp2" + output_bucket = "output-bucket" + output_key = "output-folder/file_modified.jp2" - @patch("jp2_remediator.processor.os.path.exists", return_value=True) - @patch("jp2_remediator.processor.boto3.client", autospec=True) - @patch("builtins.print", autospec=True) - def test_process_s3_bucket_with_jp2_files( - self, mock_print, mock_boto3_client, mock_os_path_exists, processor, mock_box_reader_factory - ): - mock_s3_client = MagicMock() - mock_boto3_client.return_value = mock_s3_client - input_bucket = "test-input-bucket" - input_prefix = "some-prefix/" - output_bucket = "test-output-bucket" - output_prefix = "processed/" - mock_s3_client.list_objects_v2.return_value = { - "Contents": [ - {"Key": "some-prefix/image1.jp2"}, - {"Key": "some-prefix/image2.jp2"} - ] - } + # Simulate successful S3 calls mock_s3_client.download_file.return_value = None mock_s3_client.upload_file.return_value = None - processor.process_s3_bucket(input_bucket, input_prefix, output_bucket, output_prefix) - - mock_s3_client.list_objects_v2.assert_called_once() - mock_print.assert_any_call(f"Processing file: some-prefix/image1.jp2 from bucket: {input_bucket}") - mock_print.assert_any_call(f"Processing file: some-prefix/image2.jp2 from bucket: {input_bucket}") - assert mock_box_reader_factory.get_reader.call_count == 2 - assert mock_box_reader_factory.get_reader.return_value.read_jp2_file.call_count == 2 - - @patch("jp2_remediator.processor.boto3.client", autospec=True) - @patch("builtins.print", autospec=True) - def test_process_s3_bucket_empty_response(self, mock_print, mock_boto3_client, processor): - mock_s3_client = MagicMock() - mock_boto3_client.return_value = mock_s3_client - mock_s3_client.list_objects_v2.return_value = {} - processor.process_s3_bucket("test-bucket", "test-prefix/", "output-bucket", "output-prefix/") - mock_print.assert_not_called() - mock_s3_client.upload_file.assert_not_called() + processor.process_s3_file(input_bucket, input_key, output_bucket, output_key=output_key) - @patch("jp2_remediator.processor.os.path.exists", return_value=True) - @patch("jp2_remediator.processor.boto3.client", autospec=True) - @patch("builtins.print", autospec=True) - def test_process_s3_bucket_skip_non_jp2_files( - self, mock_print, mock_boto3_client, mock_os_path_exists, processor - ): - mock_s3_client = MagicMock() - mock_boto3_client.return_value = mock_s3_client - mock_s3_client.list_objects_v2.return_value = { - "Contents": [ - {"Key": "test-prefix/file1.jp2"}, - {"Key": "test-prefix/file2.txt"}, - {"Key": "test-prefix/file3.jpg"}, - ] - } - processor.process_s3_bucket("test-bucket", "test-prefix/", "output-bucket", "output-prefix/") - mock_print.assert_any_call("Processing file: test-prefix/file1.jp2 from bucket: test-bucket") - mock_s3_client.upload_file.assert_called_once() - - @patch("jp2_remediator.processor.os.path.exists", side_effect=[True, False]) + # 1. Check upload_file with a wildcard in local path + upload_calls = [ + call + for call in mock_s3_client.upload_file.call_args_list + if "/tmp/file_modified_" in call.args[0] # local path wildcard + and call.args[1] == output_bucket + and call.args[2] == output_key + ] + assert len(upload_calls) == 1, "Expected exactly one upload call with wildcard local path." + + # 2. Verify logger calls + all_logger_msgs = [call.args[0] for call in processor.logger.info.mock_calls] + assert any("Downloading file: test-folder/file.jp2 from bucket: test-bucket" in msg for msg in all_logger_msgs), \ + "Expected 'Downloading file:' log not found." + assert any("Uploading modified file to bucket: output-bucket, key: output-folder/file_modified.jp2" in msg for msg in all_logger_msgs), \ + "Expected 'Uploading modified file:' log not found." + + @patch("jp2_remediator.processor.os.path.exists", return_value=False) @patch("jp2_remediator.processor.boto3.client", autospec=True) - @patch("builtins.print", autospec=True) - def test_process_s3_file_upload_logic( - self, mock_print, mock_boto3_client, mock_os_path_exists, processor, mock_box_reader_factory + def test_process_s3_file_file_does_not_exist( + self, mock_boto3_client, mock_os_path_exists, processor ): + """ + When the modified file does NOT exist, we expect: + 1) No upload to S3 (upload_file not called). + 2) A log message stating the file does not exist, skipping upload. + """ mock_s3_client = MagicMock() mock_boto3_client.return_value = mock_s3_client + input_bucket = "test-bucket" input_key = "test-folder/file.jp2" output_bucket = "output-bucket" output_key = "output-folder/file_modified.jp2" + mock_s3_client.download_file.return_value = None - mock_s3_client.upload_file.return_value = None - # First call (file exists) - processor.process_s3_file(input_bucket, input_key, output_bucket, output_key=output_key) - # Second call (file does not exist after modification) processor.process_s3_file(input_bucket, input_key, output_bucket, output_key=output_key) - mock_print.assert_any_call(f"Uploading modified file to bucket: {output_bucket}, key: {output_key}") - all_prints = [c.args[0] for c in mock_print.call_args_list] - assert any("does not exist, skipping upload." in p for p in all_prints) + mock_s3_client.upload_file.assert_not_called() + + all_logger_msgs = [call.args[0] for call in processor.logger.info.mock_calls] + assert any("does not exist, skipping upload" in msg for msg in all_logger_msgs), \ + "Expected 'skipping upload' log message not found." \ No newline at end of file From 852a372f8f63e17f6a3a681b059e4c6818ee2cd9 Mon Sep 17 00:00:00 2001 From: kim pham Date: Tue, 24 Dec 2024 10:35:42 +0100 Subject: [PATCH 11/11] flake test --- src/jp2_remediator/tests/unit/test_processor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/jp2_remediator/tests/unit/test_processor.py b/src/jp2_remediator/tests/unit/test_processor.py index c666c3b..ff4d1d3 100644 --- a/src/jp2_remediator/tests/unit/test_processor.py +++ b/src/jp2_remediator/tests/unit/test_processor.py @@ -74,9 +74,11 @@ def test_process_s3_file_with_output_key( # 2. Verify logger calls all_logger_msgs = [call.args[0] for call in processor.logger.info.mock_calls] - assert any("Downloading file: test-folder/file.jp2 from bucket: test-bucket" in msg for msg in all_logger_msgs), \ + assert any("Downloading file: test-folder/file.jp2 from bucket: test-bucket" + in msg for msg in all_logger_msgs), \ "Expected 'Downloading file:' log not found." - assert any("Uploading modified file to bucket: output-bucket, key: output-folder/file_modified.jp2" in msg for msg in all_logger_msgs), \ + assert any("Uploading modified file to bucket: output-bucket, key: output-folder/file_modified.jp2" + in msg for msg in all_logger_msgs), \ "Expected 'Uploading modified file:' log not found." @patch("jp2_remediator.processor.os.path.exists", return_value=False) @@ -105,4 +107,4 @@ def test_process_s3_file_file_does_not_exist( all_logger_msgs = [call.args[0] for call in processor.logger.info.mock_calls] assert any("does not exist, skipping upload" in msg for msg in all_logger_msgs), \ - "Expected 'skipping upload' log message not found." \ No newline at end of file + "Expected 'skipping upload' log message not found."