Skip to content

Commit

Permalink
Merge branch 'development' into requirements_update
Browse files Browse the repository at this point in the history
  • Loading branch information
dombean authored Nov 26, 2024
2 parents b820cdb + 0cecc6a commit 93709b4
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.6
current_version = 0.4.1
commit = False
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)
Expand Down
48 changes: 47 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,47 @@ and this project adheres to [semantic versioning](https://semver.org/spec/v2.0.0

### Removed

## [0.4.1] - 2024-11-25

### Added

### Changed
- Updated the `ons-mkdocs-theme` version number in `doc` requirements in `setup.cfg`.

### Deprecated

### Fixed

### Removed

## [0.4.0] - 2024-11-21

### Added

### Changed
- Unpinned `pandas` version in `setup.cfg` to allow for more flexibility
in dependency management.
- Removed `numpy` from `setup.cfg` as it will be installed automatically by `pandas`.

### Deprecated

### Fixed

### Removed

## [v0.3.7] - 2024-11-20

### Added
- Added `write_csv` function inside `cdp/helpers/s3_utils.py`.

### Changed

### Deprecated

### Fixed

### Removed

## [v0.3.6] - 2024-10-16

### Added
Expand Down Expand Up @@ -434,7 +475,12 @@ and this project adheres to [semantic versioning](https://semver.org/spec/v2.0.0
> due to bugs in the GitHub Action `deploy_pypi.yaml`, which deploys to PyPI
> and GitHub Releases.

- rdsa-utils v0.4.1: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.4.1) |
[PyPI](https://pypi.org/project/rdsa-utils/0.4.1/)
- rdsa-utils v0.4.0: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.4.0) |
[PyPI](https://pypi.org/project/rdsa-utils/0.4.0/)
- rdsa-utils v0.3.7: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.3.7) |
[PyPI](https://pypi.org/project/rdsa-utils/0.3.7/)
- rdsa-utils v0.3.6: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.3.6) |
[PyPI](https://pypi.org/project/rdsa-utils/0.3.6/)
- rdsa-utils v0.3.5: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.3.5) |
Expand Down
2 changes: 1 addition & 1 deletion rdsa_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.6"
__version__ = "0.4.1"
86 changes: 82 additions & 4 deletions rdsa_utils/cdp/helpers/s3_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
and SSL certificate, you can use the following code snippet:
```python
import boto3
import raz_client
Expand All @@ -24,6 +25,7 @@

import json
import logging
from io import StringIO
from pathlib import Path
from typing import Dict, List, Optional

Expand Down Expand Up @@ -1042,13 +1044,13 @@ def load_json(
Parameters
----------
client : boto3.client
client
The boto3 S3 client instance.
bucket_name : str
bucket_name
The name of the S3 bucket.
filepath : str
filepath
The key (full path and filename) of the JSON file in the S3 bucket.
encoding : str, optional
encoding
The encoding of the JSON file. Default is 'utf-8'.
Returns
Expand Down Expand Up @@ -1098,3 +1100,79 @@ def load_json(
raise Exception(error_message) from e

return data


def write_csv(
    client: boto3.client,
    bucket_name: str,
    data: pd.DataFrame,
    filepath: str,
    **kwargs,
) -> bool:
    """Write a Pandas DataFrame as a CSV object in an S3 bucket.

    The dataframe is first serialised into an in-memory ``StringIO`` buffer;
    the buffer's contents are then uploaded to the bucket using the client's
    ``put_object`` method.

    Parameters
    ----------
    client
        The boto3 S3 client instance.
    bucket_name
        The name of the S3 bucket.
    data
        The dataframe to write to the specified path.
    filepath
        The key (full path and filename) to save the dataframe to.
    kwargs
        Optional keyword arguments forwarded to ``data.to_csv``.

    Returns
    -------
    bool
        True if the dataframe is written successfully.
        False if it was not possible to serialise or write the file.

    Notes
    -----
    This function does not raise. Any exception hit while serialising the
    dataframe or uploading it is logged at error level and reported to the
    caller through the ``False`` return value.

    Examples
    --------
    >>> s3_client = boto3.client('s3')
    >>> data = pd.DataFrame({
    ...     'column1': [1, 2, 3],
    ...     'column2': ['a', 'b', 'c']
    ... })
    >>> write_csv(s3_client, 'my_bucket', data, 'path/to/file.csv')
    True
    """
    try:
        # Serialise into an in-memory text buffer. The context manager
        # releases the buffer as soon as its contents have been extracted.
        with StringIO() as csv_buffer:
            data.to_csv(csv_buffer, **kwargs)
            # getvalue() returns the whole buffer regardless of the current
            # stream position, so no rewind is needed before reading it.
            csv_body = csv_buffer.getvalue()

        # The put_object response is intentionally discarded so that it is
        # not echoed to the console or the log.
        client.put_object(
            Bucket=bucket_name,
            Body=csv_body,
            Key=filepath,
        )
    except Exception as e:
        # Deliberately broad: the contract is "return False on any failure",
        # whether it comes from serialisation or from the upload.
        error_message = (
            f"Error writing to csv or saving to bucket {bucket_name}, "
            f"filepath {filepath}: {e}"
        )
        logger.error(error_message)
        return False

    logger.info(f"Successfully wrote dataframe to {bucket_name}/{filepath}")
    return True
3 changes: 1 addition & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ install_requires =
cloudpathlib[gs]>=0.15.1
humanfriendly>=9.1
more-itertools>=9.0.0
pandas==1.5.3
numpy==1.24.4 # Temporarily pin numpy due to https://github.com/numpy/numpy/issues/26710
pandas
pydantic>=2.6.2
pyyaml>=6.0.1
tomli>=2.0.1
Expand Down
42 changes: 42 additions & 0 deletions tests/cdp/helpers/test_s3_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json

import boto3
import pandas as pd
import pytest
from moto import mock_aws

Expand All @@ -24,6 +25,7 @@
upload_folder,
validate_bucket_name,
validate_s3_file_path,
write_csv,
)
from rdsa_utils.exceptions import InvalidBucketNameError, InvalidS3FilePathError

Expand Down Expand Up @@ -1064,3 +1066,43 @@ def test_load_json_invalid_bucket_name(self, s3_client):

with pytest.raises(InvalidBucketNameError):
load_json(s3_client, "INVALID_BUCKET", "test-file.json")


class TestWriteCSV:
    """Tests for write_csv function."""

    @pytest.fixture(scope="class")
    def s3_client(self):
        """Yield a mocked boto3 S3 client with the test bucket created.

        The fixture is class-scoped, so every test in this class shares the
        same mocked bucket.
        """
        with mock_aws():
            mocked_client = boto3.client("s3", region_name="us-east-1")
            mocked_client.create_bucket(Bucket="test-bucket")
            yield mocked_client

    def test_write_csv_success(self, s3_client):
        """Check that write_csv reports success for a valid dataframe."""
        frame = pd.DataFrame(
            {"name": ["John"], "age": [30], "city": ["Manchester"]},
        )

        assert write_csv(s3_client, "test-bucket", frame, "test_file.csv")

    def test_write_csv_read_back(self, s3_client):
        """Check that a file written by write_csv round-trips via load_csv.

        The index is suppressed through kwargs so that the frame read back
        matches the input frame.
        """
        expected = pd.DataFrame(
            {"name": ["John"], "age": [30], "city": ["Manchester"]},
        )

        write_csv(s3_client, "test-bucket", expected, "test_file.csv", index=False)
        actual = load_csv(s3_client, "test-bucket", "test_file.csv")

        pd.testing.assert_frame_equal(expected, actual)

    def test_write_csv_failure(self, s3_client):
        """Check that write_csv returns False when it cannot write.

        A plain dictionary has no to_csv method, so serialisation fails.
        """
        not_a_frame = {"name": ["John"], "age": [30], "city": ["Manchester"]}

        outcome = write_csv(
            s3_client,
            "test-bucket",
            not_a_frame,
            "test_file.csv",
            index=False,
        )

        assert not outcome

0 comments on commit 93709b4

Please sign in to comment.