Skip to content

Commit

Permalink
Merge pull request #115 from ONSdigital/development
Browse files Browse the repository at this point in the history
Release 0.3.2
  • Loading branch information
dombean authored Sep 2, 2024
2 parents bd6d295 + 05b169d commit b0dc331
Show file tree
Hide file tree
Showing 13 changed files with 1,080 additions and 17 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.1
current_version = 0.3.2
commit = False
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/deploy_mkdocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.11"

- name: Install dependencies
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pull_request_workflow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip3 install -e .[dev,doc]
pip3 install -e .[dev]
- name: Run tests
run: pytest
26 changes: 26 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,30 @@ and this project adheres to [semantic versioning](https://semver.org/spec/v2.0.0

### Removed

## [v0.3.2] - 2024-09-02

### Added
- Added `load_csv` to `helpers/pyspark.py` with kwargs parameter.
- Added `truncate_external_hive_table` to `helpers/pyspark.py`.
- Added `get_tables_in_database` to `cdp/io/input.py`.
- Added `load_csv` to `cdp/helpers/s3_utils.py`. This loads a CSV from S3 bucket
into a Pandas DataFrame.

### Changed
- Removed `.config("spark.shuffle.service.enabled", "true")`
from `create_spark_session()` as it is not compatible with CDP. Added
`.config("spark.dynamicAllocation.shuffleTracking.enabled", "true")` &
`.config("spark.sql.adaptive.enabled", "true")`.
- Changed `mkdocs` theme from `mkdocs-tech-docs-template` to `ons-mkdocs-theme`.
- Added more parameters to `load_and_validate_table()` in `cdp/io/input.py`.

### Deprecated

### Fixed
- Temporarily pin `numpy==1.24.4` due to https://github.com/numpy/numpy/issues/26710

### Removed

## [v0.3.1] - 2024-05-24

### Added
Expand Down Expand Up @@ -348,6 +372,8 @@ and this project adheres to [semantic versioning](https://semver.org/spec/v2.0.0
> and GitHub Releases.

- rdsa-utils v0.3.2: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.3.2) |
[PyPI](https://pypi.org/project/rdsa-utils/0.3.2/)
- rdsa-utils v0.3.1: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.3.1) |
[PyPI](https://pypi.org/project/rdsa-utils/0.3.1/)
- rdsa-utils v0.3.0: [GitHub Release](https://github.com/ONSdigital/rdsa-utils/releases/tag/v0.3.0) |
Expand Down
22 changes: 18 additions & 4 deletions mkdocs.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
site_name: rdsa-utils Documentation

theme:
name: tech_docs_template
name: ons_mkdocs_theme
features:
- navigation.tabs
- navigation.tabs.sticky
- navigation.indexes
logo: assets/ons_logo_white.svg
favicon: assets/ons_favicon.svg
- navigation.sections
- toc.integrate
- content.tabs.link
- content.code.annotation
- content.code.copy
language: en
logo: assets/images/logo.svg
favicon: assets/images/favicon.ico

repo_name: rdsa-utils
repo_url: https://github.com/ONSdigital/rdsa-utils
Expand Down Expand Up @@ -41,3 +46,12 @@ nav:
- API Reference: reference.md
- Contribution Guide: contribution_guide.md
- Branching & Deployment Guide: branch_and_deploy_guide.md

extra:
social:
- icon: fontawesome/brands/github
link: https://github.com/ONSdigital/rdsa-utils

# Do not remove the copy right section. But you can change the copyright information.
copyright: |
&copy; <a href="https://www.ons.gov.uk">Office for National Statistics 2024</a>
2 changes: 1 addition & 1 deletion rdsa_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.1"
__version__ = "0.3.2"
149 changes: 148 additions & 1 deletion rdsa_utils/cdp/helpers/s3_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@

import logging
from pathlib import Path
from typing import List, Optional
from typing import Dict, List, Optional

import boto3
import pandas as pd

from rdsa_utils.exceptions import InvalidBucketNameError

Expand Down Expand Up @@ -815,3 +816,149 @@ def delete_folder(
f"in bucket {bucket_name}: {str(e)}",
)
return False


def load_csv(
    client: boto3.client,
    bucket_name: str,
    filepath: str,
    keep_columns: Optional[List[str]] = None,
    rename_columns: Optional[Dict[str, str]] = None,
    drop_columns: Optional[List[str]] = None,
    **kwargs,
) -> pd.DataFrame:
    """Load a CSV file from an S3 bucket into a Pandas DataFrame.

    Parameters
    ----------
    client
        The boto3 S3 client instance.
    bucket_name
        The name of the S3 bucket.
    filepath
        The key (full path and filename) of the CSV file in the S3 bucket.
    keep_columns
        A list of column names to keep in the DataFrame, dropping all others.
        Default value is None.
    rename_columns
        A dictionary to rename columns where keys are existing column
        names and values are new column names.
        Default value is None.
    drop_columns
        A list of column names to drop from the DataFrame.
        Default value is None.
    kwargs
        Additional keyword arguments to pass to the `pd.read_csv` method.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame containing the data from the CSV file.

    Raises
    ------
    Exception
        If there is an error loading the file.
    ValueError
        If a column specified in rename_columns, drop_columns, or
        keep_columns is not found in the DataFrame.

    Notes
    -----
    Transformation order:

    1. Columns are kept according to `keep_columns`.
    2. Columns are dropped according to `drop_columns`.
    3. Columns are renamed according to `rename_columns`.

    Each step is validated against the columns present at that point in the
    pipeline, so e.g. dropping a column already removed by `keep_columns`
    raises a clear ValueError.

    Examples
    --------
    Load a CSV file and rename columns:

    >>> df = load_csv(
    ...     client,
    ...     "my-bucket",
    ...     "path/to/file.csv",
    ...     rename_columns={"old_name": "new_name"},
    ... )

    Load a CSV file and keep only specific columns:

    >>> df = load_csv(
    ...     client,
    ...     "my-bucket",
    ...     "path/to/file.csv",
    ...     keep_columns=["col1", "col2"],
    ... )

    Load a CSV file and drop specific columns:

    >>> df = load_csv(
    ...     client,
    ...     "my-bucket",
    ...     "path/to/file.csv",
    ...     drop_columns=["col1", "col2"],
    ... )

    Load a CSV file with custom delimiter:

    >>> df = load_csv(
    ...     client,
    ...     "my-bucket",
    ...     "path/to/file.csv",
    ...     sep=";",
    ... )
    """
    try:
        # Fetch the object; response["Body"] is a streaming file-like handle
        # that pd.read_csv can consume directly.
        response = client.get_object(Bucket=bucket_name, Key=filepath)
        logger.info(
            f"Loaded CSV file from S3 bucket {bucket_name}, filepath {filepath}",
        )

        df = pd.read_csv(response["Body"], **kwargs)

    except Exception as e:
        error_message = (
            f"Error loading file from bucket {bucket_name}, filepath {filepath}: {e}"
        )
        logger.error(error_message)
        raise Exception(error_message) from e

    # Apply column transformations: keep, drop, rename.
    # BUG FIX: validation previously checked a column list snapshotted
    # before any transformation ran. A column removed by `keep_columns`
    # still passed the stale check, so `drop_columns` then raised an
    # unhandled pandas KeyError (not the documented ValueError), and
    # `rename_columns` silently no-opped. Each step now validates against
    # the DataFrame's *current* columns.
    if keep_columns:
        missing_columns = [col for col in keep_columns if col not in df.columns]
        if missing_columns:
            error_message = (
                f"Columns {missing_columns} not found in DataFrame and cannot be kept"
            )
            logger.error(error_message)
            raise ValueError(error_message)
        df = df[keep_columns]

    if drop_columns:
        for col in drop_columns:
            if col not in df.columns:
                error_message = (
                    f"Column '{col}' not found in DataFrame and cannot be dropped"
                )
                logger.error(error_message)
                raise ValueError(error_message)
            df = df.drop(columns=[col])

    if rename_columns:
        for old_name, new_name in rename_columns.items():
            if old_name not in df.columns:
                error_message = (
                    f"Column '{old_name}' not found in DataFrame and "
                    f"cannot be renamed to '{new_name}'"
                )
                logger.error(error_message)
                raise ValueError(error_message)
            df = df.rename(columns={old_name: new_name})

    return df
Loading

0 comments on commit b0dc331

Please sign in to comment.