Merge branch 'development' into requirements_update

ONSdigital · Nov 26, 2024 · 93709b4 · 93709b4
2 parents b820cdb + 0cecc6a
commit 93709b4
Show file tree

Hide file tree

Showing 6 changed files with 174 additions and 9 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.6
+current_version = 0.4.1
 commit = False
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,47 @@ and this project adheres to [semantic versioning](https://semver.org/spec/v2.0.0
 
 ### Removed
 
+## [0.4.1] - 2024-11-25
+
+### Added
+
+### Changed
+- Updated the `ons-mkdocs-theme` version number in `doc` requirements in `setup.cfg`.
+
+### Deprecated
+
+### Fixed
+
+### Removed
+
+## [0.4.0] - 2024-11-21
+
+### Added
+
+### Changed
+- Unpinned `pandas` version in `setup.cfg` to allow for more flexibility 
+  in dependency management.
+- Removed `numpy` from `setup.cfg` as it will be installed automatically by `pandas`.
+
+### Deprecated
+
+### Fixed
+
+### Removed
+
+## [v0.3.7] - 2024-11-20
+
+### Added
+- Added `write_csv` function inside `cdp/helpers/s3_utils.py`.
+
+### Changed
+
+### Deprecated
+
+### Fixed
+
+### Removed
+
 ## [v0.3.6] - 2024-10-16
 
 ### Added
@@ -434,7 +475,12 @@ and this project adheres to [semantic versioning](https://semver.org/spec/v2.0.0
 > due to bugs in the GitHub Action `deploy_pypi.yaml`, which deploys to PyPI
 > and GitHub Releases.
 
-
+- rdsa-utils v0.4.1: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.4.1) | 
+  [PyPI](https://pypi.org/project/rdsa-utils/0.4.1/)
+- rdsa-utils v0.4.0: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.4.0) | 
+  [PyPI](https://pypi.org/project/rdsa-utils/0.4.0/)
+- rdsa-utils v0.3.7: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.3.7) | 
+  [PyPI](https://pypi.org/project/rdsa-utils/0.3.7/)
 - rdsa-utils v0.3.6: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.3.6) | 
   [PyPI](https://pypi.org/project/rdsa-utils/0.3.6/)
 - rdsa-utils v0.3.5: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.3.5) | 

diff --git a/rdsa_utils/__init__.py b/rdsa_utils/__init__.py
@@ -1 +1 @@
-__version__ = "0.3.6"
+__version__ = "0.4.1"
diff --git a/rdsa_utils/cdp/helpers/s3_utils.py b/rdsa_utils/cdp/helpers/s3_utils.py
@@ -4,6 +4,7 @@
 and SSL certificate, you can use the following code snippet:
 
 ```python
+
 import boto3
 import raz_client
 
@@ -24,6 +25,7 @@
 
 import json
 import logging
+from io import StringIO
 from pathlib import Path
 from typing import Dict, List, Optional
 
@@ -1042,13 +1044,13 @@ def load_json(
 
     Parameters
     ----------
-    client : boto3.client
+    client
         The boto3 S3 client instance.
-    bucket_name : str
+    bucket_name
         The name of the S3 bucket.
-    filepath : str
+    filepath
         The key (full path and filename) of the JSON file in the S3 bucket.
-    encoding : str, optional
+    encoding
         The encoding of the JSON file. Default is 'utf-8'.
 
     Returns
@@ -1098,3 +1100,79 @@ def load_json(
         raise Exception(error_message) from e
 
     return data
+
+
+def write_csv(
+    client: boto3.client,
+    bucket_name: str,
+    data: pd.DataFrame,
+    filepath: str,
+    **kwargs,
+) -> bool:
+    """Write a Pandas Dataframe to csv in an S3 bucket.
+
+    Uses an in-memory StringIO buffer: pandas first writes the CSV text to
+    the buffer, the buffer is rewound to the start, and its contents are then
+    sent to the S3 bucket with the boto3 client's put_object method.
+
+    Parameters
+    ----------
+    client
+        The boto3 S3 client instance.
+    bucket_name
+        The name of the S3 bucket.
+    data
+        The dataframe to write to the specified path.
+    filepath
+        The filepath to save the dataframe to.
+    kwargs
+        Optional keyword arguments passed through to Pandas to_csv.
+
+    Returns
+    -------
+    bool
+        True if the dataframe is written successfully.
+        False if it was not possible to serialise or write the file.
+
+    Notes
+    -----
+    No exception is raised on failure: any error while serialising or
+    uploading is logged and reported through the ``False`` return value.
+
+    Examples
+    --------
+    >>> s3_client = boto3.client('s3')
+    >>> data = pd.DataFrame({
+    >>>     'column1': [1, 2, 3],
+    >>>     'column2': ['a', 'b', 'c']
+    >>> })
+    >>> write_csv(s3_client, 'my_bucket', data, 'path/to/file.csv')
+    True
+    """
+    try:
+        # Create an Input-Output buffer
+        csv_buffer = StringIO()
+
+        # Write the dataframe to the buffer in the CSV format
+        data.to_csv(csv_buffer, **kwargs)
+
+        # "Rewind" the stream to the start of the buffer
+        csv_buffer.seek(0)
+
+        # Write the buffer into the s3 bucket. Assign the output to a mute
+        # variable, so the output is not displayed in the console or log.
+        _ = client.put_object(
+            Bucket=bucket_name,
+            Body=csv_buffer.getvalue(),
+            Key=filepath,
+        )
+        logger.info(f"Successfully wrote dataframe to {bucket_name}/{filepath}")
+        return True
+
+    except Exception as e:
+        error_message = (
+            f"Error writing to csv or saving to bucket {bucket_name}, "
+            f"filepath {filepath}: {e}"
+        )
+        logger.error(error_message)
+        return False
diff --git a/setup.cfg b/setup.cfg
@@ -23,8 +23,7 @@ install_requires =
     cloudpathlib[gs]>=0.15.1
     humanfriendly>=9.1
     more-itertools>=9.0.0
-    pandas==1.5.3
-    numpy==1.24.4 # Temporarily pin numpy due to https://github.com/numpy/numpy/issues/26710
+    pandas
     pydantic>=2.6.2
     pyyaml>=6.0.1
     tomli>=2.0.1

diff --git a/tests/cdp/helpers/test_s3_utils.py b/tests/cdp/helpers/test_s3_utils.py
@@ -3,6 +3,7 @@
 import json
 
 import boto3
+import pandas as pd
 import pytest
 from moto import mock_aws
 
@@ -24,6 +25,7 @@
     upload_folder,
     validate_bucket_name,
     validate_s3_file_path,
+    write_csv,
 )
 from rdsa_utils.exceptions import InvalidBucketNameError, InvalidS3FilePathError
 
@@ -1064,3 +1066,43 @@ def test_load_json_invalid_bucket_name(self, s3_client):
 
         with pytest.raises(InvalidBucketNameError):
             load_json(s3_client, "INVALID_BUCKET", "test-file.json")
+
+
+class TestWriteCSV:
+    """Tests for write_csv function."""
+
+    @pytest.fixture(scope="class")
+    def s3_client(self):
+        """Boto3 S3 client fixture for this test class."""
+        with mock_aws():
+            s3 = boto3.client("s3", region_name="us-east-1")
+            s3.create_bucket(Bucket="test-bucket")
+            yield s3
+
+    def test_write_csv_success(self, s3_client):
+        """Test that write_csv returns True if successful."""
+        data = {"name": ["John"], "age": [30], "city": ["Manchester"]}
+        df = pd.DataFrame(data)
+
+        result = write_csv(s3_client, "test-bucket", df, "test_file.csv")
+        assert result
+
+    def test_write_csv_read_back(self, s3_client):
+        """Test that a file written by write_csv can be read back and returns
+        the same dataframe as input. Uses kwargs.
+        """
+        data = {"name": ["John"], "age": [30], "city": ["Manchester"]}
+        df = pd.DataFrame(data)
+
+        _ = write_csv(s3_client, "test-bucket", df, "test_file.csv", index=False)
+        result = load_csv(s3_client, "test-bucket", "test_file.csv")
+        pd.testing.assert_frame_equal(df, result)
+
+    def test_write_csv_failure(self, s3_client):
+        """Test that write_csv returns False if unable to write.
+        Dictionary data does not have to_csv method.
+        """
+        data = {"name": ["John"], "age": [30], "city": ["Manchester"]}
+
+        result = write_csv(s3_client, "test-bucket", data, "test_file.csv", index=False)
+        assert not result