Skip to content

Commit

Permalink
Merge pull request #115 from ONSdigital/development
Browse files Browse the repository at this point in the history
Release 0.3.2
  • Loading branch information
dombean authored Sep 2, 2024
2 parents bd6d295 + 05b169d commit b0dc331
Show file tree
Hide file tree
Showing 13 changed files with 1,080 additions and 17 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.1
current_version = 0.3.2
commit = False
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/deploy_mkdocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.11"

- name: Install dependencies
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pull_request_workflow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip3 install -e .[dev,doc]
pip3 install -e .[dev]
- name: Run tests
run: pytest
26 changes: 26 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,30 @@ and this project adheres to [semantic versioning](https://semver.org/spec/v2.0.0

### Removed

## [v0.3.2] - 2024-09-02

### Added
- Added `load_csv` to `helpers/pyspark.py` with kwargs parameter.
- Added `truncate_external_hive_table` to `helpers/pyspark.py`.
- Added `get_tables_in_database` to `cdp/io/input.py`.
- Added `load_csv` to `cdp/helpers/s3_utils.py`. This loads a CSV from S3 bucket
into a Pandas DataFrame.

### Changed
- Removed `.config("spark.shuffle.service.enabled", "true")`
from `create_spark_session()` as it is not compatible with CDP. Added
`.config("spark.dynamicAllocation.shuffleTracking.enabled", "true")` &
`.config("spark.sql.adaptive.enabled", "true")`.
- Changed `mkdocs` theme from `mkdocs-tech-docs-template` to `ons-mkdocs-theme`.
- Added more parameters to `load_and_validate_table()` in `cdp/io/input.py`.

### Deprecated

### Fixed
- Temporarily pin `numpy==1.24.4` due to https://github.com/numpy/numpy/issues/26710

### Removed

## [v0.3.1] - 2024-05-24

### Added
Expand Down Expand Up @@ -348,6 +372,8 @@ and this project adheres to [semantic versioning](https://semver.org/spec/v2.0.0
> and GitHub Releases.

- rdsa-utils v0.3.2: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.3.2) |
[PyPI](https://pypi.org/project/rdsa-utils/0.3.2/)
- rdsa-utils v0.3.1: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.3.1) |
[PyPI](https://pypi.org/project/rdsa-utils/0.3.1/)
- rdsa-utils v0.3.0: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.3.0) |
Expand Down
22 changes: 18 additions & 4 deletions mkdocs.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
site_name: rdsa-utils Documentation

theme:
name: tech_docs_template
name: ons_mkdocs_theme
features:
- navigation.tabs
- navigation.tabs.sticky
- navigation.indexes
logo: assets/ons_logo_white.svg
favicon: assets/ons_favicon.svg
- navigation.sections
- toc.integrate
- content.tabs.link
- content.code.annotation
- content.code.copy
language: en
logo: assets/images/logo.svg
favicon: assets/images/favicon.ico

repo_name: rdsa-utils
repo_url: https://github.com/ONSdigital/rdsa-utils
Expand Down Expand Up @@ -41,3 +46,12 @@ nav:
- API Reference: reference.md
- Contribution Guide: contribution_guide.md
- Branching & Deployment Guide: branch_and_deploy_guide.md

extra:
social:
- icon: fontawesome/brands/github
link: https://github.com/ONSdigital/rdsa-utils

# Do not remove the copy right section. But you can change the copyright information.
copyright: |
&copy; <a href="https://www.ons.gov.uk">Office for National Statistics 2024</a>
2 changes: 1 addition & 1 deletion rdsa_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.1"
__version__ = "0.3.2"
149 changes: 148 additions & 1 deletion rdsa_utils/cdp/helpers/s3_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@

import logging
from pathlib import Path
from typing import List, Optional
from typing import Dict, List, Optional

import boto3
import pandas as pd

from rdsa_utils.exceptions import InvalidBucketNameError

Expand Down Expand Up @@ -815,3 +816,149 @@ def delete_folder(
f"in bucket {bucket_name}: {str(e)}",
)
return False


def load_csv(
    client: boto3.client,
    bucket_name: str,
    filepath: str,
    keep_columns: Optional[List[str]] = None,
    rename_columns: Optional[Dict[str, str]] = None,
    drop_columns: Optional[List[str]] = None,
    **kwargs,
) -> pd.DataFrame:
    """Load a CSV file from an S3 bucket into a Pandas DataFrame.

    Parameters
    ----------
    client
        The boto3 S3 client instance.
    bucket_name
        The name of the S3 bucket.
    filepath
        The key (full path and filename) of the CSV file in the S3 bucket.
    keep_columns
        A list of column names to keep in the DataFrame, dropping all others.
        Default value is None.
    rename_columns
        A dictionary to rename columns where keys are existing column
        names and values are new column names.
        Default value is None.
    drop_columns
        A list of column names to drop from the DataFrame.
        Default value is None.
    kwargs
        Additional keyword arguments to pass to the `pd.read_csv` method.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame containing the data from the CSV file.

    Raises
    ------
    Exception
        If there is an error loading the file.
    ValueError
        If a column specified in rename_columns, drop_columns, or
        keep_columns is not found in the DataFrame.

    Notes
    -----
    Transformation order:

    1. Columns are kept according to `keep_columns`.
    2. Columns are dropped according to `drop_columns`.
    3. Columns are renamed according to `rename_columns`.

    Each step is validated against the columns present at that point in the
    pipeline, so e.g. dropping a column already removed by `keep_columns`
    raises a clear ValueError.

    Examples
    --------
    Load a CSV file and rename columns:

    >>> df = load_csv(
    ...     client,
    ...     "my-bucket",
    ...     "path/to/file.csv",
    ...     rename_columns={"old_name": "new_name"},
    ... )

    Load a CSV file and keep only specific columns:

    >>> df = load_csv(
    ...     client,
    ...     "my-bucket",
    ...     "path/to/file.csv",
    ...     keep_columns=["col1", "col2"],
    ... )

    Load a CSV file and drop specific columns:

    >>> df = load_csv(
    ...     client,
    ...     "my-bucket",
    ...     "path/to/file.csv",
    ...     drop_columns=["col1", "col2"],
    ... )

    Load a CSV file with custom delimiter:

    >>> df = load_csv(
    ...     client,
    ...     "my-bucket",
    ...     "path/to/file.csv",
    ...     sep=";",
    ... )
    """
    try:
        # Fetch the object; response["Body"] is a streaming file-like handle
        # that pd.read_csv can consume directly.
        response = client.get_object(Bucket=bucket_name, Key=filepath)
        logger.info(
            f"Loaded CSV file from S3 bucket {bucket_name}, filepath {filepath}",
        )

        df = pd.read_csv(response["Body"], **kwargs)

    except Exception as e:
        error_message = (
            f"Error loading file from bucket {bucket_name}, filepath {filepath}: {e}"
        )
        logger.error(error_message)
        raise Exception(error_message) from e

    # Apply column transformations: keep, drop, rename.
    # BUG FIX: validation previously checked a column list snapshotted
    # before any transformation ran. A column removed by `keep_columns`
    # still passed the stale check, so `drop_columns` then raised an
    # unhandled pandas KeyError (not the documented ValueError), and
    # `rename_columns` silently no-opped. Each step now validates against
    # the DataFrame's *current* columns.
    if keep_columns:
        missing_columns = [col for col in keep_columns if col not in df.columns]
        if missing_columns:
            error_message = (
                f"Columns {missing_columns} not found in DataFrame and cannot be kept"
            )
            logger.error(error_message)
            raise ValueError(error_message)
        df = df[keep_columns]

    if drop_columns:
        for col in drop_columns:
            if col not in df.columns:
                error_message = (
                    f"Column '{col}' not found in DataFrame and cannot be dropped"
                )
                logger.error(error_message)
                raise ValueError(error_message)
            df = df.drop(columns=[col])

    if rename_columns:
        for old_name, new_name in rename_columns.items():
            if old_name not in df.columns:
                error_message = (
                    f"Column '{old_name}' not found in DataFrame and "
                    f"cannot be renamed to '{new_name}'"
                )
                logger.error(error_message)
                raise ValueError(error_message)
            df = df.rename(columns={old_name: new_name})

    return df
Loading

0 comments on commit b0dc331

Please sign in to comment.