diff --git a/.github/ISSUE_TEMPLATE/amos_feature_request.md b/.github/ISSUE_TEMPLATE/amos_feature_request.md deleted file mode 100644 index 656f4751d..000000000 --- a/.github/ISSUE_TEMPLATE/amos_feature_request.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -name: AMOS Feature request -about: A feature request following the AMOS project guidelines -title: '' -labels: '' -assignees: '' - ---- -## User Story - -1. As a ... (Who?) -2. I want to ... (What?) -3. So that ... (Why?) - -## Additional context - -Add any other context or screenshots about the feature request here. - -## Acceptance Criteria - -- **Add acceptance criteria here** - -## Definition of Done - -- Test cases have been created and are running successfully -- Documentation for the new component was added -- GitHub Actions are running without errors - diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index 82eab7a09..cc272246e 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -25,8 +25,6 @@ jobs: uses: rtdip/core/.github/workflows/test.yml@develop job_run_unit_tests_and_sonarqube: - # TODO: Remove in Merge to main project - # uses: amosproj/amos2024ws01-rtdip-data-quality-checker/.github/workflows/sonarcloud_reusable.yml@hotfix/pipeline-moto uses: rtdip/core/.github/workflows/sonarcloud_reusable.yml@develop with: REPO_NAME: "" diff --git a/.gitignore b/.gitignore index 94cd7d594..d7f045ff8 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,10 @@ __pycache__/ *.py[cod] *$py.class +# AMOS project specific folders +Deliverables/ +amos_feature_request.md + # C extensions *.so diff --git a/Deliverables/sprint-01/.gitkeep b/Deliverables/sprint-01/.gitkeep deleted file mode 100644 index 8b1378917..000000000 --- a/Deliverables/sprint-01/.gitkeep +++ /dev/null @@ -1 +0,0 @@ - diff --git a/Deliverables/sprint-01/feature-board.png b/Deliverables/sprint-01/feature-board.png deleted file mode 100644 index dbe7c6102..000000000 Binary files a/Deliverables/sprint-01/feature-board.png and /dev/null differ diff --git a/Deliverables/sprint-01/feature-board.tsv b/Deliverables/sprint-01/feature-board.tsv deleted file mode 100644 index d8a9ae7f1..000000000 --- a/Deliverables/sprint-01/feature-board.tsv +++ /dev/null @@ -1,9 +0,0 @@ -Title URL Assignees Status Estimated size Real size -Please adopt the Deliverables folder structure from https://github.com/amosproj/amos202Xss0Y-projname to your repo / branch https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/7 Product Backlog -Set up a development environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/1 Product Backlog -[Component] Identify missing data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/2 Product Backlog -[Component] Outlier detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/3 Product Backlog -[Component] Duplicate detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/4 Product Backlog -[Component] Trend identification https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/5 Product Backlog -[Component] Anomaly detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/6 Product Backlog -Fix broken virtual environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/8 Product Backlog \ No newline at end of file diff --git
a/Deliverables/sprint-01/imp-squared-backlog.jpg b/Deliverables/sprint-01/imp-squared-backlog.jpg deleted file mode 100644 index 538d9e7b7..000000000 Binary files a/Deliverables/sprint-01/imp-squared-backlog.jpg and /dev/null differ diff --git a/Deliverables/sprint-01/imp-squared-backlog.tsv b/Deliverables/sprint-01/imp-squared-backlog.tsv deleted file mode 100644 index 6d15b7eb2..000000000 --- a/Deliverables/sprint-01/imp-squared-backlog.tsv +++ /dev/null @@ -1 +0,0 @@ -Title Assignees Status \ No newline at end of file diff --git a/Deliverables/sprint-01/planning-documents.pdf b/Deliverables/sprint-01/planning-documents.pdf deleted file mode 100644 index 9460d0357..000000000 Binary files a/Deliverables/sprint-01/planning-documents.pdf and /dev/null differ diff --git a/Deliverables/sprint-01/team-logo.png b/Deliverables/sprint-01/team-logo.png deleted file mode 100644 index 9cb6d3820..000000000 Binary files a/Deliverables/sprint-01/team-logo.png and /dev/null differ diff --git a/Deliverables/sprint-01/team-logo.svg b/Deliverables/sprint-01/team-logo.svg deleted file mode 100644 index f19f860cd..000000000 --- a/Deliverables/sprint-01/team-logo.svg +++ /dev/null @@ -1,116 +0,0 @@ diff --git a/Deliverables/sprint-02/feature_board.jpg b/Deliverables/sprint-02/feature_board.jpg deleted file mode 100644 index 7a7c93045..000000000 Binary files a/Deliverables/sprint-02/feature_board.jpg and /dev/null differ diff --git a/Deliverables/sprint-02/feature_board.tsv b/Deliverables/sprint-02/feature_board.tsv deleted file mode 100644 index 4994bd6fe..000000000 --- a/Deliverables/sprint-02/feature_board.tsv +++ /dev/null @@ -1,18 +0,0 @@ -Title URL Assignees Status Estimated size Real size -[Component] Anomaly detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/6 FelipeTrost Awaiting Review -[Component] Duplicate detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/4 chris-1187, dh1542 Awaiting Review 8 -[sprint-02] Create software bill of materials https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/9 kristen149 Awaiting Review 1 -[Component] Identify missing data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/2 mollle In Progress -[sprint-02] Create software architecture diagram https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/10 dh1542, Timm638 In Progress 3 -Set up a development environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/1 In Progress -Explore the test data and brainstorm RTDIP component ideas https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/11 In Progress 5 -Please adopt the Deliverables folder structure from https://github.com/amosproj/amos202Xss0Y-projname to your repo / branch https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/7 Feature Archive -Fix broken virtual environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/8 dh1542, Timm638 Feature Archive 3 -Develop a test pipeline to run during release Product Backlog -Define clear acceptance criteria for components https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/16 Product Backlog -[Component] Outlier detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/3 FelipeTrost Product
Backlog -[Component] Dimensionality Reduction https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/17 Product Backlog -[Component] Normalization of Data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/18 Product Backlog -[Component] Alternative Preprocessing Methods https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/19 Product Backlog -[Component] Trend Identification https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/20 Product Backlog -[Component] Data Format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/21 Product Backlog \ No newline at end of file diff --git a/Deliverables/sprint-02/imp-squared-backlog.jpg b/Deliverables/sprint-02/imp-squared-backlog.jpg deleted file mode 100644 index 2b77d6890..000000000 Binary files a/Deliverables/sprint-02/imp-squared-backlog.jpg and /dev/null differ diff --git a/Deliverables/sprint-02/imp-squared-backlog.tsv b/Deliverables/sprint-02/imp-squared-backlog.tsv deleted file mode 100644 index 3fc247f1b..000000000 --- a/Deliverables/sprint-02/imp-squared-backlog.tsv +++ /dev/null @@ -1,6 +0,0 @@ -Title Assignees Status -Slack workspace (Avi) In Progress -Get to know expectations and requirements from Industry Partner In Progress -Homework no assigned clearly - now assigned In Progress -SD Meeting In Progress -Figure out pipeline bug - for everyone Todo \ No newline at end of file diff --git a/Deliverables/sprint-02/planning-documents.pdf b/Deliverables/sprint-02/planning-documents.pdf deleted file mode 100644 index 0b519b088..000000000 Binary files a/Deliverables/sprint-02/planning-documents.pdf and /dev/null differ diff --git a/Deliverables/sprint-02/rtdip-sdk-sbom.md b/Deliverables/sprint-02/rtdip-sdk-sbom.md deleted file mode 100644 index 11727ac14..000000000 --- a/Deliverables/sprint-02/rtdip-sdk-sbom.md +++ /dev/null @@ -1,76 +0,0 @@ -# Software Bill of Materials (SBOM) - -## Project Name: rtdip-sdk -## Version: [RTDIP_SDK_1] -## Date: [29.10.2024] -## License: Apache License, Version 2.0 - -### Overview -This SBOM lists all required and optional dependencies for the `rtdip-sdk` project, including their versions and licenses. - -### Components - - -| Field | Name | Version Range | Supplier | License | Comment | -|-------|---------------------------------|-----------------------|-----------------------------|--------------------|--------------------------| -| 1 | databricks-sql-connector | >=3.1.0,<4.0.0 | Databricks, Inc. 
| Apache 2.0 | SQL connector for Databricks | -| 2 | azure-identity | >=1.12.0,<2.0.0 | Microsoft | MIT | Identity management for Azure | -| 3 | pandas | >=1.5.2,<2.2.0 | The Pandas Development Team | BSD 3-Clause | Data manipulation library | -| 4 | jinja2 | >=3.1.4,<4.0.0 | Jinja2 Team | BSD 3-Clause | Template engine for Python | -| 5 | importlib_metadata | >=7.0.0 | PyPa | MIT | Metadata for Python packages | -| 6 | semver | >=3.0.0,<4.0.0 | Mikhail Korobeynikov | MIT | Semantic versioning library | -| 7 | xlrd | >=2.0.1,<3.0.0 | Python Software Foundation | MIT | Library for reading Excel files | -| 8 | grpcio | >=1.48.1 | Google LLC | Apache 2.0 | gRPC library for Python | -| 9 | grpcio-status | >=1.48.1 | Google LLC | Apache 2.0 | gRPC status library | -| 10 | googleapis-common-protos | >=1.56.4 | Google LLC | Apache 2.0 | Common protobufs for Google APIs | -| 11 | langchain | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Framework for LLMs | -| 12 | langchain-community | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Community contributions to LangChain | -| 13 | openai | >=1.13.3,<2.0.0 | OpenAI | MIT | OpenAI API client | -| 14 | pydantic | >=2.6.0,<3.0.0 | Samuel Colvin | MIT | Data validation library | -| 15 | pyspark | >=3.3.0,<3.6.0 | The Apache Software Foundation | Apache 2.0 | Spark library for Python | -| 16 | delta-spark | >=2.2.0,<3.3.0 | Databricks, Inc. | Apache 2.0 | Delta Lake integration with Spark | -| 17 | dependency-injector | >=4.41.0,<5.0.0 | Paul Ganssle | MIT | Dependency injection framework | -| 18 | databricks-sdk | >=0.20.0,<1.0.0 | Databricks, Inc. | Apache 2.0 | SDK for Databricks services | -| 19 | azure-storage-file-datalake | >=12.12.0,<13.0.0 | Microsoft | MIT | Azure Data Lake Storage client | -| 20 | azure-mgmt-storage | >=21.0.0 | Microsoft | MIT | Azure Storage management client | -| 21 | azure-mgmt-eventgrid | >=10.2.0 | Microsoft | MIT | Azure Event Grid management client | -| 22 | boto3 | >=1.28.2,<2.0.0 | Amazon Web Services | Apache 2.0 | AWS SDK for Python | -| 23 | hvac | >=1.1.1 | HashiCorp | MPL 2.0 | HashiCorp Vault client | -| 24 | azure-keyvault-secrets | >=4.7.0,<5.0.0 | Microsoft | MIT | Azure Key Vault secrets management | -| 25 | web3 | >=6.18.0,<7.0.0 | N/A | MIT | Ethereum blockchain library | -| 26 | polars[deltalake] | >=0.18.8,<1.0.0 | N/A | MIT | DataFrame library with Delta Lake support | -| 27 | delta-sharing | >=1.0.0,<1.1.0 | N/A | Apache 2.0 | Delta Sharing library | -| 28 | xarray | >=2023.1.0,<2023.8.0 | N/A | BSD 3-Clause | N-dimensional array library | -| 29 | ecmwf-api-client | >=1.6.3,<2.0.0 | N/A | Apache 2.0 | ECMWF API client | -| 30 | netCDF4 | >=1.6.4,<2.0.0 | N/A | BSD 3-Clause | NetCDF file reading/writing | -| 31 | joblib | >=1.3.2,<2.0.0 | N/A | BSD 3-Clause | Lightweight pipelining library | -| 32 | sqlparams | >=5.1.0,<6.0.0 | N/A | MIT | SQL query parameters library | -| 33 | entsoe-py | >=0.5.10,<1.0.0 | N/A | MIT | ENTSOE API client | -| 34 | pytest | ==7.4.0 | N/A | MIT | Testing framework | -| 35 | pytest-mock | ==3.11.1 | N/A | MIT | Mocking for pytest | -| 36 | pytest-cov | ==4.1.0 | N/A | MIT | Coverage reporting for pytest | -| 37 | pylint | ==2.17.4 | N/A | GPL 2.0 | Static code analysis for Python | -| 38 | pip | >=23.1.2 | N/A | MIT | Python package installer | -| 39 | turbodbc | ==4.11.0 | N/A | MIT | ODBC interface for Python | -| 40 | numpy | >=1.23.4,<2.0.0 | NumPy Developers | BSD 3-Clause | Numerical computing library | -| 41 | oauthlib | >=3.2.2,<4.0.0 | N/A | MIT | OAuth library | -| 42 | 
cryptography | >=38.0.3 | N/A | MIT | Cryptography library | -| 43 | fastapi | >=0.110.0,<1.0.0 | Sebastián Ramírez | MIT | Fast web framework | -| 44 | httpx | >=0.24.1,<1.0.0 | N/A | MIT | HTTP client for Python | -| 45 | openjdk | >=11.0.15,<12.0.0 | N/A | N/A | OpenJDK Java runtime | -| 46 | mkdocs-material | ==9.5.20 | N/A | MIT | Material theme for MkDocs | -| 47 | mkdocs-material-extensions | ==1.3.1 | N/A | MIT | Extensions for MkDocs | -| 48 | mkdocstrings | ==0.25.0 | N/A | MIT | Documentation generation | -| 49 | mkdocstrings-python | ==1.10.8 | N/A | MIT | Python support for mkdocstrings | -| 50 | mkdocs-macros-plugin | ==1.0.1 | N/A | MIT | Macros for MkDocs | -| 51 | mkdocs-autorefs | >=1.0.0,<1.1.0 | N/A | MIT | Automatic references for MkDocs | -| 52 | pygments | ==2.16.1 | N/A | BSD 2-Clause | Syntax highlighting library | -| 53 | pymdown-extensions | ==10.8.1 | N/A | MIT | Extensions for Markdown | -| 54 | pygithub | >=1.59.0 | N/A | MIT | GitHub API client | -| 55 | pyjwt | >=2.8.0,<3.0.0 | N/A | MIT | JSON Web | -| 56 | conda | >=24.9.2 | N/A | BSD 3-Clause | Package installer | - - -### Summary -- **Total Components**: 56 -- **Last Updated**: [ 29.10.2024] diff --git a/Deliverables/sprint-02/software-architecture.pdf b/Deliverables/sprint-02/software-architecture.pdf deleted file mode 100644 index 51a4a60d1..000000000 Binary files a/Deliverables/sprint-02/software-architecture.pdf and /dev/null differ diff --git a/Deliverables/sprint-03/feature-board.jpg b/Deliverables/sprint-03/feature-board.jpg deleted file mode 100644 index 401e0492c..000000000 Binary files a/Deliverables/sprint-03/feature-board.jpg and /dev/null differ diff --git a/Deliverables/sprint-03/feature-board.tsv b/Deliverables/sprint-03/feature-board.tsv deleted file mode 100644 index 754668937..000000000 --- a/Deliverables/sprint-03/feature-board.tsv +++ /dev/null @@ -1,24 +0,0 @@ -Title URL Assignees Status Estimated size Real size Labels -Create a test pipeline to run during release https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/24 FelipeTrost In Progress 5 -[Component] Anomaly detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/6 FelipeTrost Awaiting Review 3 enhancement -[Component] Identify missing data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/2 mollle Awaiting Review 8 enhancement -[Component] Normalization of Data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/18 kristen149, Timm638 In Progress 8 -Explore the test data and brainstorm RTDIP component ideas https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/11 chris-1187 In Progress 5 -[Component] Clean data based on Interval/Pattern https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/22 chris-1187, dh1542 In Progress -[sprint-02] Create software architecture diagram https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/10 dh1542, Timm638 Feature Archive 3 5 -[sprint-02] Create software bill of materials https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/9 kristen149 Feature Archive 1 1 -Fix broken virtual environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/8 dh1542, Timm638 Feature Archive 3 bug -[Component] Duplicate detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/4 chris-1187, dh1542 
Feature Archive 8 8 enhancement -[Component] Outlier detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/3 FelipeTrost Feature Archive duplicate -Set up a development environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/1 Feature Archive good first issue -Time series prediction with linear regression https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/28 Product Backlog component -Time Series prediction using ARIMA https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/29 Product Backlog component -Store monitoring outputs in a standardized format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/26 Product Backlog enhancement -Validation of value ranges https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/31 Product Backlog component -Advanced Duplicate Detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/30 Product Backlog component -Dimensionality Reduction https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/17 Product Backlog component -Alternative Preprocessing Methods https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/19 Product Backlog component -Please adopt the Deliverables folder structure from https://github.com/amosproj/amos202Xss0Y-projname to your repo / branch https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/7 Feature Archive documentation -[Component] Trend Identification https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/20 Feature Archive -Define clear acceptance criteria for components https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/16 Feature Archive -[Component] Data Format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/21 Feature Archive \ No newline at end of file diff --git a/Deliverables/sprint-03/imp-squared-backlog.jpg b/Deliverables/sprint-03/imp-squared-backlog.jpg deleted file mode 100644 index fe18f9ab2..000000000 Binary files a/Deliverables/sprint-03/imp-squared-backlog.jpg and /dev/null differ diff --git a/Deliverables/sprint-03/imp-squared-backlog.tsv b/Deliverables/sprint-03/imp-squared-backlog.tsv deleted file mode 100644 index 3fc247f1b..000000000 --- a/Deliverables/sprint-03/imp-squared-backlog.tsv +++ /dev/null @@ -1,6 +0,0 @@ -Title Assignees Status -Slack workspace (Avi) In Progress -Get to know expectations and requirements from Industry Partner In Progress -Homework no assigned clearly - now assigned In Progress -SD Meeting In Progress -Figure out pipeline bug - for everyone Todo \ No newline at end of file diff --git a/Deliverables/sprint-03/planning-documents.pdf b/Deliverables/sprint-03/planning-documents.pdf deleted file mode 100644 index 7b62ce61d..000000000 Binary files a/Deliverables/sprint-03/planning-documents.pdf and /dev/null differ diff --git a/Deliverables/sprint-03/rtdip-sdk-sbom.md b/Deliverables/sprint-03/rtdip-sdk-sbom.md deleted file mode 100644 index b97351fc2..000000000 --- a/Deliverables/sprint-03/rtdip-sdk-sbom.md +++ /dev/null @@ -1,87 +0,0 @@ -# Software Bill of Materials (SBOM) - -## Project Name: rtdip-sdk -## Version: [RTDIP_SDK_1] -## Date: [05.11.2024] -## License: Apache License, Version 2.0 - -### Overview -This SBOM lists all required and optional dependencies for the 
`rtdip-sdk` project, including their versions and licenses. - -### Components - - -| Field | Name | Version Range | Supplier | License | Comment | -|-------|--------------------------------|----------------------|-----------------------------|--------------------|--------------------------| -| 1 | databricks-sql-connector | >=3.1.0,<4.0.0 | Databricks, Inc. | Apache 2.0 | SQL connector for Databricks | -| 2 | azure-identity | >=1.12.0,<2.0.0 | Microsoft | MIT | Identity management for Azure | -| 3 | pandas | >=1.5.2,<2.2.0 | The Pandas Development Team | BSD 3-Clause | Data manipulation library | -| 4 | jinja2 | >=3.1.4,<4.0.0 | Jinja2 Team | BSD 3-Clause | Template engine for Python | -| 5 | importlib_metadata | >=7.0.0 | PyPa | MIT | Metadata for Python packages | -| 6 | semver | >=3.0.0,<4.0.0 | Mikhail Korobeynikov | MIT | Semantic versioning library | -| 7 | xlrd | >=2.0.1,<3.0.0 | Python Software Foundation | MIT | Library for reading Excel files | -| 8 | grpcio | >=1.48.1 | Google LLC | Apache 2.0 | gRPC library for Python | -| 9 | grpcio-status | >=1.48.1 | Google LLC | Apache 2.0 | gRPC status library | -| 10 | googleapis-common-protos | >=1.56.4 | Google LLC | Apache 2.0 | Common protobufs for Google APIs | -| 11 | langchain | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Framework for LLMs | -| 12 | langchain-community | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Community contributions to LangChain | -| 13 | openai | >=1.13.3,<2.0.0 | OpenAI | MIT | OpenAI API client | -| 14 | pydantic | >=2.6.0,<3.0.0 | Samuel Colvin | MIT | Data validation library | -| 15 | pyspark | >=3.3.0,<3.6.0 | The Apache Software Foundation | Apache 2.0 | Spark library for Python | -| 16 | delta-spark | >=2.2.0,<3.3.0 | Databricks, Inc. | Apache 2.0 | Delta Lake integration with Spark | -| 17 | dependency-injector | >=4.41.0,<5.0.0 | Paul Ganssle | MIT | Dependency injection framework | -| 18 | databricks-sdk | >=0.20.0,<1.0.0 | Databricks, Inc. 
| Apache 2.0 | SDK for Databricks services | -| 19 | azure-storage-file-datalake | >=12.12.0,<13.0.0 | Microsoft | MIT | Azure Data Lake Storage client | -| 20 | azure-mgmt-storage | >=21.0.0 | Microsoft | MIT | Azure Storage management client | -| 21 | azure-mgmt-eventgrid | >=10.2.0 | Microsoft | MIT | Azure Event Grid management client | -| 22 | boto3 | >=1.28.2,<2.0.0 | Amazon Web Services | Apache 2.0 | AWS SDK for Python | -| 23 | hvac | >=1.1.1 | HashiCorp | MPL 2.0 | HashiCorp Vault client | -| 24 | azure-keyvault-secrets | >=4.7.0,<5.0.0 | Microsoft | MIT | Azure Key Vault secrets management | -| 25 | web3 | >=6.18.0,<7.0.0 | N/A | MIT | Ethereum blockchain library | -| 26 | polars[deltalake] | >=0.18.8,<1.0.0 | N/A | MIT | DataFrame library with Delta Lake support | -| 27 | delta-sharing | >=1.0.0,<1.1.0 | N/A | Apache 2.0 | Delta Sharing library | -| 28 | xarray | >=2023.1.0,<2023.8.0 | N/A | BSD 3-Clause | N-dimensional array library | -| 29 | ecmwf-api-client | >=1.6.3,<2.0.0 | N/A | Apache 2.0 | ECMWF API client | -| 30 | netCDF4 | >=1.6.4,<2.0.0 | N/A | BSD 3-Clause | NetCDF file reading/writing | -| 31 | joblib | >=1.3.2,<2.0.0 | N/A | BSD 3-Clause | Lightweight pipelining library | -| 32 | sqlparams | >=5.1.0,<6.0.0 | N/A | MIT | SQL query parameters library | -| 33 | entsoe-py | >=0.5.10,<1.0.0 | N/A | MIT | ENTSOE API client | -| 34 | pytest | ==7.4.0 | N/A | MIT | Testing framework | -| 35 | pytest-mock | ==3.11.1 | N/A | MIT | Mocking for pytest | -| 36 | pytest-cov | ==4.1.0 | N/A | MIT | Coverage reporting for pytest | -| 37 | pylint | ==2.17.4 | N/A | GPL 2.0 | Static code analysis for Python | -| 38 | pip | >=23.1.2 | N/A | MIT | Python package installer | -| 39 | turbodbc | ==4.11.0 | N/A | MIT | ODBC interface for Python | -| 40 | numpy | >=1.23.4,<2.0.0 | NumPy Developers | BSD 3-Clause | Numerical computing library | -| 41 | oauthlib | >=3.2.2,<4.0.0 | N/A | MIT | OAuth library | -| 42 | cryptography | >=38.0.3 | N/A | MIT | Cryptography library | -| 43 | fastapi | >=0.110.0,<1.0.0 | Sebastián Ramírez | MIT | Fast web framework | -| 44 | httpx | >=0.24.1,<1.0.0 | N/A | MIT | HTTP client for Python | -| 45 | openjdk | >=11.0.15,<12.0.0 | N/A | N/A | OpenJDK Java runtime | -| 46 | mkdocs-material | ==9.5.20 | N/A | MIT | Material theme for MkDocs | -| 47 | mkdocs-material-extensions | ==1.3.1 | N/A | MIT | Extensions for MkDocs | -| 48 | mkdocstrings | ==0.25.0 | N/A | MIT | Documentation generation | -| 49 | mkdocstrings-python | ==1.10.8 | N/A | MIT | Python support for mkdocstrings | -| 50 | mkdocs-macros-plugin | ==1.0.1 | N/A | MIT | Macros for MkDocs | -| 51 | mkdocs-autorefs | >=1.0.0,<1.1.0 | N/A | MIT | Automatic references for MkDocs | -| 52 | pygments | ==2.16.1 | N/A | BSD 2-Clause | Syntax highlighting library | -| 53 | pymdown-extensions | ==10.8.1 | N/A | MIT | Extensions for Markdown | -| 54 | pygithub | >=1.59.0 | N/A | MIT | GitHub API client | -| 55 | pyjwt | >=2.8.0,<3.0.0 | N/A | MIT | JSON Web | -| 56 | conda | >=24.9.2 | N/A | BSD 3-Clause | Package installer | -| 57 | python | >=3.9,<3.12 | Python Software Foundation | PSF | Python programming language | -| 58 | pyodbc | >=4.0.39,<5.0.0 | N/A | MIT | ODBC library for Python | -| 59 | twine | ==4.0.2 | PyPA | Apache 2.0 | Python package publishing tool | -| 60 | black | >=24.1.0 | Python Software Foundation | MIT | Code formatter for Python | -| 61 | great-expectations | >=0.18.8,<1.0.0 | N/A | Apache 2.0 | Data validation tool | -| 62 | azure-functions | >=1.15.0,<2.0.0 | Microsoft | MIT | 
Functions for Azure services | -| 63 | build | ==0.10.0 | PyPA | MIT | Python package build tool | -| 64 | deltalake | >=0.10.1,<1.0.0 | Delta, Inc. | Apache 2.0 | Delta Lake interaction for Python | -| 65 | trio | >=0.22.1 | Python Software Foundation | MIT | Async library for concurrency | -| 66 | eth-typing | >=4.2.3,<5.0.0 | Ethereum Foundation | MIT | Ethereum types library | -| 67 | moto[s3] | >=5.0.16,<6.0.0 | Spulec | Apache 2.0 | Mock library for AWS S3 | -| 68 | pyarrow | >=14.0.1,<17.0.0 | Apache Arrow | Apache 2.0 | Columnar data storage and processing | - -### Summary -- **Total Components**: 68 -- **Last Updated**: [05.11.2024] diff --git a/Deliverables/sprint-04/Screenshot 2024-11-13 130226.png b/Deliverables/sprint-04/Screenshot 2024-11-13 130226.png deleted file mode 100644 index e6bb876fe..000000000 Binary files a/Deliverables/sprint-04/Screenshot 2024-11-13 130226.png and /dev/null differ diff --git a/Deliverables/sprint-04/Screenshot 2024-11-13 130453.png b/Deliverables/sprint-04/Screenshot 2024-11-13 130453.png deleted file mode 100644 index 4cee2b492..000000000 Binary files a/Deliverables/sprint-04/Screenshot 2024-11-13 130453.png and /dev/null differ diff --git a/Deliverables/sprint-04/amos2024ws01-feature-board - Board.tsv b/Deliverables/sprint-04/amos2024ws01-feature-board - Board.tsv deleted file mode 100644 index ef525c90c..000000000 --- a/Deliverables/sprint-04/amos2024ws01-feature-board - Board.tsv +++ /dev/null @@ -1,32 +0,0 @@ -Title URL Assignees Status Estimated size Real size Labels -Time Series prediction using ARIMA https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/29 Timm638 Feature Archive 13 8 component -Normalization of Data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/18 kristen149, Timm638 Feature Archive 8 8 component -Clean data based on Interval/Pattern https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/22 dh1542 Feature Archive 8 8 component -Create a test pipeline to run during release https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/24 FelipeTrost Feature Archive 5 1 -[Component] Identify missing data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/2 mollle Feature Archive 8 8 enhancement -Explore the test data and brainstorm RTDIP component ideas https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/11 chris-1187 Feature Archive 5 5 -[Component] Anomaly detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/6 FelipeTrost Feature Archive 3 8 enhancement -[sprint-02] Create software architecture diagram https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/10 dh1542, Timm638 Feature Archive 3 5 -[sprint-02] Create software bill of materials https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/9 kristen149 Feature Archive 1 1 -Fix broken virtual environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/8 dh1542, Timm638 Feature Archive 3 3 bug -[Component] Duplicate detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/4 chris-1187, dh1542 Feature Archive 8 8 enhancement -[Component] Outlier detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/3 FelipeTrost Feature Archive duplicate -Set up a development environment 
https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/1 Feature Archive good first issue -Time series prediction with linear regression https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/28 FelipeTrost, kristen149 Product Backlog 8 component -Missing value imputation https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/36 chris-1187 Product Backlog 13 component -Store monitoring outputs in a standardized format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/26 dh1542 Product Backlog 13 enhancement -Validation of value ranges https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/31 mollle Product Backlog 3 component -Finish integrating ARIMA functionality of statsmodels into RTDIP https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/40 Product Backlog component -Reduce number of parameters needed to use ArimaPrediction effectively https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/41 Product Backlog component -Prepare RTDIP demo https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/43 Product Backlog -Advanced Duplicate Detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/30 Product Backlog component -Flatline detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/44 Product Backlog component -One-Hot Encoding https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/45 Product Backlog component -Dimensionality Reduction https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/17 Product Backlog component -Data Binning https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/46 Product Backlog component -Interval Screening and Missing Entry Insertion https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/47 Product Backlog component -Alternative Preprocessing Methods https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/19 Feature Archive component -Please adopt the Deliverables folder structure from https://github.com/amosproj/amos202Xss0Y-projname to your repo / branch https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/7 Feature Archive documentation -[Component] Trend Identification https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/20 Feature Archive -Define clear acceptance criteria for components https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/16 Feature Archive -[Component] Data Format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/21 Feature Archive \ No newline at end of file diff --git a/Deliverables/sprint-04/imp-squared-backlog.tsv b/Deliverables/sprint-04/imp-squared-backlog.tsv deleted file mode 100644 index 6b8507ae8..000000000 --- a/Deliverables/sprint-04/imp-squared-backlog.tsv +++ /dev/null @@ -1,10 +0,0 @@ -Title Assignees Status -Slack workspace (Avi) Done -SD Meeting Done -Homework no assigned clearly - now assigned Done -Figure out pipeline bug - for everyone Done -Get to know expectations and requirements from Industry Partner In Progress -Coordinated PR Reviews Todo -Make sure everyone can run the product (for example via readme doc) Todo -No expericence in ML Todo -Long term 
planning such as: (notes in description) Todo \ No newline at end of file diff --git a/Deliverables/sprint-04/planning-documents.pdf b/Deliverables/sprint-04/planning-documents.pdf deleted file mode 100644 index 2d3a43501..000000000 Binary files a/Deliverables/sprint-04/planning-documents.pdf and /dev/null differ diff --git a/Deliverables/sprint-04/rtdip-sdk-sbom.md b/Deliverables/sprint-04/rtdip-sdk-sbom.md deleted file mode 100644 index b97351fc2..000000000 --- a/Deliverables/sprint-04/rtdip-sdk-sbom.md +++ /dev/null @@ -1,87 +0,0 @@ -# Software Bill of Materials (SBOM) - -## Project Name: rtdip-sdk -## Version: [RTDIP_SDK_1] -## Date: [05.11.2024] -## License: Apache License, Version 2.0 - -### Overview -This SBOM lists all required and optional dependencies for the `rtdip-sdk` project, including their versions and licenses. - -### Components - - -| Field | Name | Version Range | Supplier | License | Comment | -|-------|--------------------------------|----------------------|-----------------------------|--------------------|--------------------------| -| 1 | databricks-sql-connector | >=3.1.0,<4.0.0 | Databricks, Inc. | Apache 2.0 | SQL connector for Databricks | -| 2 | azure-identity | >=1.12.0,<2.0.0 | Microsoft | MIT | Identity management for Azure | -| 3 | pandas | >=1.5.2,<2.2.0 | The Pandas Development Team | BSD 3-Clause | Data manipulation library | -| 4 | jinja2 | >=3.1.4,<4.0.0 | Jinja2 Team | BSD 3-Clause | Template engine for Python | -| 5 | importlib_metadata | >=7.0.0 | PyPa | MIT | Metadata for Python packages | -| 6 | semver | >=3.0.0,<4.0.0 | Mikhail Korobeynikov | MIT | Semantic versioning library | -| 7 | xlrd | >=2.0.1,<3.0.0 | Python Software Foundation | MIT | Library for reading Excel files | -| 8 | grpcio | >=1.48.1 | Google LLC | Apache 2.0 | gRPC library for Python | -| 9 | grpcio-status | >=1.48.1 | Google LLC | Apache 2.0 | gRPC status library | -| 10 | googleapis-common-protos | >=1.56.4 | Google LLC | Apache 2.0 | Common protobufs for Google APIs | -| 11 | langchain | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Framework for LLMs | -| 12 | langchain-community | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Community contributions to LangChain | -| 13 | openai | >=1.13.3,<2.0.0 | OpenAI | MIT | OpenAI API client | -| 14 | pydantic | >=2.6.0,<3.0.0 | Samuel Colvin | MIT | Data validation library | -| 15 | pyspark | >=3.3.0,<3.6.0 | The Apache Software Foundation | Apache 2.0 | Spark library for Python | -| 16 | delta-spark | >=2.2.0,<3.3.0 | Databricks, Inc. | Apache 2.0 | Delta Lake integration with Spark | -| 17 | dependency-injector | >=4.41.0,<5.0.0 | Paul Ganssle | MIT | Dependency injection framework | -| 18 | databricks-sdk | >=0.20.0,<1.0.0 | Databricks, Inc. 
| Apache 2.0 | SDK for Databricks services | -| 19 | azure-storage-file-datalake | >=12.12.0,<13.0.0 | Microsoft | MIT | Azure Data Lake Storage client | -| 20 | azure-mgmt-storage | >=21.0.0 | Microsoft | MIT | Azure Storage management client | -| 21 | azure-mgmt-eventgrid | >=10.2.0 | Microsoft | MIT | Azure Event Grid management client | -| 22 | boto3 | >=1.28.2,<2.0.0 | Amazon Web Services | Apache 2.0 | AWS SDK for Python | -| 23 | hvac | >=1.1.1 | HashiCorp | MPL 2.0 | HashiCorp Vault client | -| 24 | azure-keyvault-secrets | >=4.7.0,<5.0.0 | Microsoft | MIT | Azure Key Vault secrets management | -| 25 | web3 | >=6.18.0,<7.0.0 | N/A | MIT | Ethereum blockchain library | -| 26 | polars[deltalake] | >=0.18.8,<1.0.0 | N/A | MIT | DataFrame library with Delta Lake support | -| 27 | delta-sharing | >=1.0.0,<1.1.0 | N/A | Apache 2.0 | Delta Sharing library | -| 28 | xarray | >=2023.1.0,<2023.8.0 | N/A | BSD 3-Clause | N-dimensional array library | -| 29 | ecmwf-api-client | >=1.6.3,<2.0.0 | N/A | Apache 2.0 | ECMWF API client | -| 30 | netCDF4 | >=1.6.4,<2.0.0 | N/A | BSD 3-Clause | NetCDF file reading/writing | -| 31 | joblib | >=1.3.2,<2.0.0 | N/A | BSD 3-Clause | Lightweight pipelining library | -| 32 | sqlparams | >=5.1.0,<6.0.0 | N/A | MIT | SQL query parameters library | -| 33 | entsoe-py | >=0.5.10,<1.0.0 | N/A | MIT | ENTSOE API client | -| 34 | pytest | ==7.4.0 | N/A | MIT | Testing framework | -| 35 | pytest-mock | ==3.11.1 | N/A | MIT | Mocking for pytest | -| 36 | pytest-cov | ==4.1.0 | N/A | MIT | Coverage reporting for pytest | -| 37 | pylint | ==2.17.4 | N/A | GPL 2.0 | Static code analysis for Python | -| 38 | pip | >=23.1.2 | N/A | MIT | Python package installer | -| 39 | turbodbc | ==4.11.0 | N/A | MIT | ODBC interface for Python | -| 40 | numpy | >=1.23.4,<2.0.0 | NumPy Developers | BSD 3-Clause | Numerical computing library | -| 41 | oauthlib | >=3.2.2,<4.0.0 | N/A | MIT | OAuth library | -| 42 | cryptography | >=38.0.3 | N/A | MIT | Cryptography library | -| 43 | fastapi | >=0.110.0,<1.0.0 | Sebastián Ramírez | MIT | Fast web framework | -| 44 | httpx | >=0.24.1,<1.0.0 | N/A | MIT | HTTP client for Python | -| 45 | openjdk | >=11.0.15,<12.0.0 | N/A | N/A | OpenJDK Java runtime | -| 46 | mkdocs-material | ==9.5.20 | N/A | MIT | Material theme for MkDocs | -| 47 | mkdocs-material-extensions | ==1.3.1 | N/A | MIT | Extensions for MkDocs | -| 48 | mkdocstrings | ==0.25.0 | N/A | MIT | Documentation generation | -| 49 | mkdocstrings-python | ==1.10.8 | N/A | MIT | Python support for mkdocstrings | -| 50 | mkdocs-macros-plugin | ==1.0.1 | N/A | MIT | Macros for MkDocs | -| 51 | mkdocs-autorefs | >=1.0.0,<1.1.0 | N/A | MIT | Automatic references for MkDocs | -| 52 | pygments | ==2.16.1 | N/A | BSD 2-Clause | Syntax highlighting library | -| 53 | pymdown-extensions | ==10.8.1 | N/A | MIT | Extensions for Markdown | -| 54 | pygithub | >=1.59.0 | N/A | MIT | GitHub API client | -| 55 | pyjwt | >=2.8.0,<3.0.0 | N/A | MIT | JSON Web | -| 56 | conda | >=24.9.2 | N/A | BSD 3-Clause | Package installer | -| 57 | python | >=3.9,<3.12 | Python Software Foundation | PSF | Python programming language | -| 58 | pyodbc | >=4.0.39,<5.0.0 | N/A | MIT | ODBC library for Python | -| 59 | twine | ==4.0.2 | PyPA | Apache 2.0 | Python package publishing tool | -| 60 | black | >=24.1.0 | Python Software Foundation | MIT | Code formatter for Python | -| 61 | great-expectations | >=0.18.8,<1.0.0 | N/A | Apache 2.0 | Data validation tool | -| 62 | azure-functions | >=1.15.0,<2.0.0 | Microsoft | MIT | 
Functions for Azure services | -| 63 | build | ==0.10.0 | PyPA | MIT | Python package build tool | -| 64 | deltalake | >=0.10.1,<1.0.0 | Delta, Inc. | Apache 2.0 | Delta Lake interaction for Python | -| 65 | trio | >=0.22.1 | Python Software Foundation | MIT | Async library for concurrency | -| 66 | eth-typing | >=4.2.3,<5.0.0 | Ethereum Foundation | MIT | Ethereum types library | -| 67 | moto[s3] | >=5.0.16,<6.0.0 | Spulec | Apache 2.0 | Mock library for AWS S3 | -| 68 | pyarrow | >=14.0.1,<17.0.0 | Apache Arrow | Apache 2.0 | Columnar data storage and processing | - -### Summary -- **Total Components**: 68 -- **Last Updated**: [05.11.2024] diff --git a/Deliverables/sprint-05/build-process-video.mkv b/Deliverables/sprint-05/build-process-video.mkv deleted file mode 100644 index 307169130..000000000 Binary files a/Deliverables/sprint-05/build-process-video.mkv and /dev/null differ diff --git a/Deliverables/sprint-05/feature-board.jpg b/Deliverables/sprint-05/feature-board.jpg deleted file mode 100644 index 7b97c2c6f..000000000 Binary files a/Deliverables/sprint-05/feature-board.jpg and /dev/null differ diff --git a/Deliverables/sprint-05/feature-board.tsv b/Deliverables/sprint-05/feature-board.tsv deleted file mode 100644 index e4b9d785d..000000000 --- a/Deliverables/sprint-05/feature-board.tsv +++ /dev/null @@ -1,32 +0,0 @@ -Title URL Assignees Status Estimated size Real size Labels -Time series prediction with linear regression https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/28 FelipeTrost, kristen149 Awaiting Review 8 component -Missing value imputation https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/36 chris-1187 Awaiting Review 13 component -Validation of value ranges https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/31 mollle Awaiting Review 3 component -Time Series prediction using ARIMA https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/29 Timm638 Awaiting Review 13 8 component -Flatline detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/44 mollle Awaiting Review component -Advanced Duplicate Detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/30 mollle In Progress component -One-Hot Encoding https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/45 kristen149 In Progress component -Prepare RTDIP demo https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/43 Timm638 In Progress 8 -Store monitoring outputs in a standardized format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/26 dh1542 In Progress 13 enhancement -Finish integrating ARIMA functionality of statsmodels into RTDIP https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/40 Sprint Backlog 5 component -Reduce number of parameters needed to use ArimaPrediction effectively https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/41 chris-1187 Sprint Backlog 8 component -Normalization of Data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/18 kristen149, Timm638 Feature Archive 8 8 component -Clean data based on Interval/Pattern https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/22 dh1542 Feature Archive 8 8 component -Create a test pipeline to run during release 
https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/24 FelipeTrost Feature Archive 5 1 -[Component] Identify missing data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/2 mollle Feature Archive 8 8 enhancement -Explore the test data and brainstorm RTDIP component ideas https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/11 chris-1187 Feature Archive 5 5 -[Component] Anomaly detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/6 FelipeTrost Feature Archive 3 8 enhancement -[sprint-02] Create software architecture diagram https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/10 dh1542, Timm638 Feature Archive 3 5 -[sprint-02] Create software bill of materials https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/9 kristen149 Feature Archive 1 1 -Fix broken virtual environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/8 dh1542, Timm638 Feature Archive 3 3 bug -[Component] Duplicate detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/4 chris-1187, dh1542 Feature Archive 8 8 enhancement -[Component] Outlier detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/3 FelipeTrost Feature Archive duplicate -Set up a development environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/1 Feature Archive good first issue -Dimensionality Reduction https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/17 Product Backlog component -Data Binning https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/46 Product Backlog component -Interval Screening and Missing Entry Insertion https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/47 Product Backlog component -Alternative Preprocessing Methods https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/19 Feature Archive component -Please adopt the Deliverables folder structure from https://github.com/amosproj/amos202Xss0Y-projname to your repo / branch https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/7 Feature Archive documentation -[Component] Trend Identification https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/20 Feature Archive -Define clear acceptance criteria for components https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/16 Feature Archive -[Component] Data Format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/21 Feature Archive \ No newline at end of file diff --git a/Deliverables/sprint-05/imp-squared-backlog.jpg b/Deliverables/sprint-05/imp-squared-backlog.jpg deleted file mode 100644 index 2b4a02aa5..000000000 Binary files a/Deliverables/sprint-05/imp-squared-backlog.jpg and /dev/null differ diff --git a/Deliverables/sprint-05/imp-squared-backlog.tsv b/Deliverables/sprint-05/imp-squared-backlog.tsv deleted file mode 100644 index 4e21cef7a..000000000 --- a/Deliverables/sprint-05/imp-squared-backlog.tsv +++ /dev/null @@ -1,12 +0,0 @@ -Title Assignees Status -Coordinated PR Reviews In Progress -Make sure everyone can run the product (for example via readme doc) In Progress -No expericence in ML In Progress -Discuss with industry partner an optimal time 
for the meeting to take place In Progress -Make sure the team meeting ends at 14:00 In Progress -Slack workspace (Avi) Done -SD Meeting Done -Homework no assigned clearly - now assigned Done -Figure out pipeline bug - for everyone Done -Get to know expectations and requirements from Industry Partner Done -Long term planning such as: (notes in description) Todo \ No newline at end of file diff --git a/Deliverables/sprint-05/planning-documents.pdf b/Deliverables/sprint-05/planning-documents.pdf deleted file mode 100644 index 3fc736057..000000000 Binary files a/Deliverables/sprint-05/planning-documents.pdf and /dev/null differ diff --git a/Deliverables/sprint-05/rtdip-sdk-sbom.md b/Deliverables/sprint-05/rtdip-sdk-sbom.md deleted file mode 100644 index b97351fc2..000000000 --- a/Deliverables/sprint-05/rtdip-sdk-sbom.md +++ /dev/null @@ -1,87 +0,0 @@ -# Software Bill of Materials (SBOM) - -## Project Name: rtdip-sdk -## Version: [RTDIP_SDK_1] -## Date: [05.11.2024] -## License: Apache License, Version 2.0 - -### Overview -This SBOM lists all required and optional dependencies for the `rtdip-sdk` project, including their versions and licenses. - -### Components - - -| Field | Name | Version Range | Supplier | License | Comment | -|-------|--------------------------------|----------------------|-----------------------------|--------------------|--------------------------| -| 1 | databricks-sql-connector | >=3.1.0,<4.0.0 | Databricks, Inc. | Apache 2.0 | SQL connector for Databricks | -| 2 | azure-identity | >=1.12.0,<2.0.0 | Microsoft | MIT | Identity management for Azure | -| 3 | pandas | >=1.5.2,<2.2.0 | The Pandas Development Team | BSD 3-Clause | Data manipulation library | -| 4 | jinja2 | >=3.1.4,<4.0.0 | Jinja2 Team | BSD 3-Clause | Template engine for Python | -| 5 | importlib_metadata | >=7.0.0 | PyPa | MIT | Metadata for Python packages | -| 6 | semver | >=3.0.0,<4.0.0 | Mikhail Korobeynikov | MIT | Semantic versioning library | -| 7 | xlrd | >=2.0.1,<3.0.0 | Python Software Foundation | MIT | Library for reading Excel files | -| 8 | grpcio | >=1.48.1 | Google LLC | Apache 2.0 | gRPC library for Python | -| 9 | grpcio-status | >=1.48.1 | Google LLC | Apache 2.0 | gRPC status library | -| 10 | googleapis-common-protos | >=1.56.4 | Google LLC | Apache 2.0 | Common protobufs for Google APIs | -| 11 | langchain | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Framework for LLMs | -| 12 | langchain-community | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Community contributions to LangChain | -| 13 | openai | >=1.13.3,<2.0.0 | OpenAI | MIT | OpenAI API client | -| 14 | pydantic | >=2.6.0,<3.0.0 | Samuel Colvin | MIT | Data validation library | -| 15 | pyspark | >=3.3.0,<3.6.0 | The Apache Software Foundation | Apache 2.0 | Spark library for Python | -| 16 | delta-spark | >=2.2.0,<3.3.0 | Databricks, Inc. | Apache 2.0 | Delta Lake integration with Spark | -| 17 | dependency-injector | >=4.41.0,<5.0.0 | Paul Ganssle | MIT | Dependency injection framework | -| 18 | databricks-sdk | >=0.20.0,<1.0.0 | Databricks, Inc. 
| Apache 2.0 | SDK for Databricks services | -| 19 | azure-storage-file-datalake | >=12.12.0,<13.0.0 | Microsoft | MIT | Azure Data Lake Storage client | -| 20 | azure-mgmt-storage | >=21.0.0 | Microsoft | MIT | Azure Storage management client | -| 21 | azure-mgmt-eventgrid | >=10.2.0 | Microsoft | MIT | Azure Event Grid management client | -| 22 | boto3 | >=1.28.2,<2.0.0 | Amazon Web Services | Apache 2.0 | AWS SDK for Python | -| 23 | hvac | >=1.1.1 | HashiCorp | MPL 2.0 | HashiCorp Vault client | -| 24 | azure-keyvault-secrets | >=4.7.0,<5.0.0 | Microsoft | MIT | Azure Key Vault secrets management | -| 25 | web3 | >=6.18.0,<7.0.0 | N/A | MIT | Ethereum blockchain library | -| 26 | polars[deltalake] | >=0.18.8,<1.0.0 | N/A | MIT | DataFrame library with Delta Lake support | -| 27 | delta-sharing | >=1.0.0,<1.1.0 | N/A | Apache 2.0 | Delta Sharing library | -| 28 | xarray | >=2023.1.0,<2023.8.0 | N/A | BSD 3-Clause | N-dimensional array library | -| 29 | ecmwf-api-client | >=1.6.3,<2.0.0 | N/A | Apache 2.0 | ECMWF API client | -| 30 | netCDF4 | >=1.6.4,<2.0.0 | N/A | BSD 3-Clause | NetCDF file reading/writing | -| 31 | joblib | >=1.3.2,<2.0.0 | N/A | BSD 3-Clause | Lightweight pipelining library | -| 32 | sqlparams | >=5.1.0,<6.0.0 | N/A | MIT | SQL query parameters library | -| 33 | entsoe-py | >=0.5.10,<1.0.0 | N/A | MIT | ENTSOE API client | -| 34 | pytest | ==7.4.0 | N/A | MIT | Testing framework | -| 35 | pytest-mock | ==3.11.1 | N/A | MIT | Mocking for pytest | -| 36 | pytest-cov | ==4.1.0 | N/A | MIT | Coverage reporting for pytest | -| 37 | pylint | ==2.17.4 | N/A | GPL 2.0 | Static code analysis for Python | -| 38 | pip | >=23.1.2 | N/A | MIT | Python package installer | -| 39 | turbodbc | ==4.11.0 | N/A | MIT | ODBC interface for Python | -| 40 | numpy | >=1.23.4,<2.0.0 | NumPy Developers | BSD 3-Clause | Numerical computing library | -| 41 | oauthlib | >=3.2.2,<4.0.0 | N/A | MIT | OAuth library | -| 42 | cryptography | >=38.0.3 | N/A | MIT | Cryptography library | -| 43 | fastapi | >=0.110.0,<1.0.0 | Sebastián Ramírez | MIT | Fast web framework | -| 44 | httpx | >=0.24.1,<1.0.0 | N/A | MIT | HTTP client for Python | -| 45 | openjdk | >=11.0.15,<12.0.0 | N/A | N/A | OpenJDK Java runtime | -| 46 | mkdocs-material | ==9.5.20 | N/A | MIT | Material theme for MkDocs | -| 47 | mkdocs-material-extensions | ==1.3.1 | N/A | MIT | Extensions for MkDocs | -| 48 | mkdocstrings | ==0.25.0 | N/A | MIT | Documentation generation | -| 49 | mkdocstrings-python | ==1.10.8 | N/A | MIT | Python support for mkdocstrings | -| 50 | mkdocs-macros-plugin | ==1.0.1 | N/A | MIT | Macros for MkDocs | -| 51 | mkdocs-autorefs | >=1.0.0,<1.1.0 | N/A | MIT | Automatic references for MkDocs | -| 52 | pygments | ==2.16.1 | N/A | BSD 2-Clause | Syntax highlighting library | -| 53 | pymdown-extensions | ==10.8.1 | N/A | MIT | Extensions for Markdown | -| 54 | pygithub | >=1.59.0 | N/A | MIT | GitHub API client | -| 55 | pyjwt | >=2.8.0,<3.0.0 | N/A | MIT | JSON Web | -| 56 | conda | >=24.9.2 | N/A | BSD 3-Clause | Package installer | -| 57 | python | >=3.9,<3.12 | Python Software Foundation | PSF | Python programming language | -| 58 | pyodbc | >=4.0.39,<5.0.0 | N/A | MIT | ODBC library for Python | -| 59 | twine | ==4.0.2 | PyPA | Apache 2.0 | Python package publishing tool | -| 60 | black | >=24.1.0 | Python Software Foundation | MIT | Code formatter for Python | -| 61 | great-expectations | >=0.18.8,<1.0.0 | N/A | Apache 2.0 | Data validation tool | -| 62 | azure-functions | >=1.15.0,<2.0.0 | Microsoft | MIT | 
Functions for Azure services | -| 63 | build | ==0.10.0 | PyPA | MIT | Python package build tool | -| 64 | deltalake | >=0.10.1,<1.0.0 | Delta, Inc. | Apache 2.0 | Delta Lake interaction for Python | -| 65 | trio | >=0.22.1 | Python Software Foundation | MIT | Async library for concurrency | -| 66 | eth-typing | >=4.2.3,<5.0.0 | Ethereum Foundation | MIT | Ethereum types library | -| 67 | moto[s3] | >=5.0.16,<6.0.0 | Spulec | Apache 2.0 | Mock library for AWS S3 | -| 68 | pyarrow | >=14.0.1,<17.0.0 | Apache Arrow | Apache 2.0 | Columnar data storage and processing | - -### Summary -- **Total Components**: 68 -- **Last Updated**: [05.11.2024] diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/interfaces.py similarity index 81% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/interfaces.py index 3809e343e..2ad1a2811 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/interfaces.py @@ -13,11 +13,13 @@ # limitations under the License. from abc import abstractmethod -from ..interfaces import PipelineComponentBaseInterface + from pyspark.sql import DataFrame +from src.sdk.python.rtdip_sdk.pipelines.interfaces import PipelineComponentBaseInterface + -class WranglerBaseInterface(PipelineComponentBaseInterface): +class DataManipulationBaseInterface(PipelineComponentBaseInterface): @abstractmethod def filter(self) -> DataFrame: pass diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py similarity index 95% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py index 5305a429e..74ed9fc2f 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2024 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
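The interfaces diff above renames `WranglerBaseInterface` to `DataManipulationBaseInterface`, whose abstract `filter()` method is the contract every cleansing component implements. Below is a minimal sketch of a component built against the renamed interface. The `ValueCapper` class and its parameters are hypothetical, and the three static methods are assumptions modeled on the pattern `IntervalFiltering` follows further down; only `DataManipulationBaseInterface`, `Libraries`, and `SystemType` come from the diff itself.

```python
# Hypothetical component sketch; only the interface and the Libraries/SystemType
# imports are taken from the diff above.
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import (
    Libraries,
    SystemType,
)
from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.interfaces import (
    DataManipulationBaseInterface,
)


class ValueCapper(DataManipulationBaseInterface):
    """Caps a numeric column at a maximum value (illustrative only)."""

    def __init__(self, df: DataFrame, column: str, cap: float) -> None:
        self.df = df
        self.column = column
        self.cap = cap

    @staticmethod
    def system_type():
        # Mirrors the system_type() pattern used by IntervalFiltering below.
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        return Libraries()

    @staticmethod
    def settings() -> dict:
        return {}

    def filter(self) -> DataFrame:
        # The interface's abstract method returns the transformed DataFrame.
        return self.df.withColumn(
            self.column, F.least(F.col(self.column), F.lit(self.cap))
        )
```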
diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/__init__.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/__init__.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/duplicate_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/duplicate_detection.py similarity index 87% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/duplicate_detection.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/duplicate_detection.py index 1fa777e02..5cf6c743e 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/duplicate_detection.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/duplicate_detection.py @@ -14,18 +14,21 @@ from pyspark.sql.functions import desc from pyspark.sql import DataFrame as PySparkDataFrame -from ...interfaces import WranglerBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from ...interfaces import DataManipulationBaseInterface +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) -class DuplicateDetection(WranglerBaseInterface): +class DuplicateDetection(DataManipulationBaseInterface): """ Cleanses a PySpark DataFrame from duplicates. Example -------- ```python - from rtdip_sdk.pipelines.monitoring.spark.data_quality.duplicate_detection import DuplicateDetection + from rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.duplicate_detection import DuplicateDetection from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/interval_filtering.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/interval_filtering.py similarity index 82% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/interval_filtering.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/interval_filtering.py index 779e8a419..2727fd8c2 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/interval_filtering.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/interval_filtering.py @@ -14,27 +14,28 @@ from datetime import timedelta import pandas as pd -from pyspark.sql.types import StringType from pyspark.sql import functions as F from pyspark.sql import SparkSession from pyspark.sql import DataFrame +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) +from ...interfaces import DataManipulationBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType -from ...interfaces import WranglerBaseInterface - -class IntervalFiltering(WranglerBaseInterface): +class IntervalFiltering(DataManipulationBaseInterface): """ - Cleanses a DataFrame by removing rows outside a specified interval window. Supported time stamp columns are DateType and StringType. + Cleanses a DataFrame by removing rows outside a specified interval window.
+    Parameters:
         spark (SparkSession): A SparkSession object.
         df (DataFrame): PySpark DataFrame to be converted
         interval (int): The interval length for cleansing.
-        interval_unit (str): 'hours', 'minutes', 'seconds' or 'milliseconds' to specify the unit of the interval.
-        time_stamp_column_name (str): The name of the column containing the time stamps. Default is 'EventTime'.
-        tolerance (int): The tolerance for the interval. Default is None.
+        interval_unit (str): 'hours', 'minutes', 'seconds' or 'milliseconds' to specify the unit of the interval.
     """
 
     """ Default time stamp column name if not set in the constructor """
@@ -59,68 +60,6 @@ def __init__(
         else:
             self.time_stamp_column_name = time_stamp_column_name
 
-    def filter(self) -> DataFrame:
-        """
-        Filters the DataFrame based on the interval
-        """
-
-        if self.time_stamp_column_name not in self.df.columns:
-            raise ValueError(
-                f"Column {self.time_stamp_column_name} not found in the DataFrame."
-            )
-        is_string_time_stamp = isinstance(
-            self.df.schema[self.time_stamp_column_name].dataType, StringType
-        )
-
-        original_schema = self.df.schema
-        self.df = self.convert_column_to_timestamp().orderBy(
-            self.time_stamp_column_name
-        )
-
-        self.df.show()
-
-        tolerance_in_ms = None
-        if self.tolerance is not None:
-            tolerance_in_ms = self.get_time_delta(self.tolerance).total_seconds() * 1000
-            print(tolerance_in_ms)
-
-        time_delta_in_ms = self.get_time_delta(self.interval).total_seconds() * 1000
-
-        rows = self.df.collect()
-        last_time_stamp = rows[0][self.time_stamp_column_name]
-        first_row = rows[0].asDict()
-
-        first_row[self.time_stamp_column_name] = (
-            self.format_date_time_to_string(first_row[self.time_stamp_column_name])
-            if is_string_time_stamp
-            else first_row[self.time_stamp_column_name]
-        )
-
-        cleansed_df = [first_row]
-
-        for i in range(1, len(rows)):
-            current_row = rows[i]
-            current_time_stamp = current_row[self.time_stamp_column_name]
-
-            if self.check_if_outside_of_interval(
-                current_time_stamp, last_time_stamp, time_delta_in_ms, tolerance_in_ms
-            ):
-                current_row_dict = current_row.asDict()
-                current_row_dict[self.time_stamp_column_name] = (
-                    self.format_date_time_to_string(
-                        current_row_dict[self.time_stamp_column_name]
-                    )
-                    if is_string_time_stamp
-                    else current_row_dict[self.time_stamp_column_name]
-                )
-
-                cleansed_df.append(current_row_dict)
-                last_time_stamp = current_time_stamp
-
-        result_df = self.spark.createDataFrame(cleansed_df, schema=original_schema)
-
-        return result_df
-
     @staticmethod
     def system_type():
         """
@@ -146,7 +85,6 @@ def convert_column_to_timestamp(self) -> DataFrame:
         except Exception as e:
             raise ValueError(
                 f"Error converting column {self.time_stamp_column_name} to timestamp: {e}"
-                f"{self.df.schema[self.time_stamp_column_name].dataType} might be unsupported!"
             )
@@ -186,3 +124,54 @@ def format_date_time_to_string(self, time_stamp: pd.Timestamp) -> str:
             return time_stamp.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
         except Exception as e:
             raise ValueError(f"Error converting timestamp to string: {e}")
+
+    def filter(self) -> DataFrame:
+        """
+        Filters the DataFrame based on the interval
+        """
+
+        if self.time_stamp_column_name not in self.df.columns:
+            raise ValueError(
+                f"Column {self.time_stamp_column_name} not found in the DataFrame."
+            )
+
+        original_schema = self.df.schema
+        self.df = self.convert_column_to_timestamp().orderBy(
+            self.time_stamp_column_name
+        )
+
+        tolerance_in_ms = None
+        if self.tolerance is not None:
+            tolerance_in_ms = self.get_time_delta(self.tolerance).total_seconds() * 1000
+            print(tolerance_in_ms)
+
+        time_delta_in_ms = self.get_time_delta(self.interval).total_seconds() * 1000
+
+        rows = self.df.collect()
+        last_time_stamp = rows[0][self.time_stamp_column_name]
+        first_row = rows[0].asDict()
+        first_row[self.time_stamp_column_name] = self.format_date_time_to_string(
+            first_row[self.time_stamp_column_name]
+        )
+
+        cleansed_df = [first_row]
+
+        for i in range(1, len(rows)):
+            current_row = rows[i]
+            current_time_stamp = current_row[self.time_stamp_column_name]
+
+            if self.check_if_outside_of_interval(
+                current_time_stamp, last_time_stamp, time_delta_in_ms, tolerance_in_ms
+            ):
+                current_row_dict = current_row.asDict()
+                current_row_dict[self.time_stamp_column_name] = (
+                    self.format_date_time_to_string(
+                        current_row_dict[self.time_stamp_column_name]
+                    )
+                )
+                cleansed_df.append(current_row_dict)
+                last_time_stamp = current_time_stamp
+
+        result_df = self.spark.createDataFrame(cleansed_df, schema=original_schema)
+
+        return result_df
diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/k_sigma_anomaly_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/k_sigma_anomaly_detection.py
similarity index 93%
rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/k_sigma_anomaly_detection.py
rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/k_sigma_anomaly_detection.py
index 7e5b9ecac..417898cc4 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/k_sigma_anomaly_detection.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/k_sigma_anomaly_detection.py
@@ -14,11 +14,14 @@
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import mean, stddev, abs, col
-from ...interfaces import WranglerBaseInterface
-from ...._pipeline_utils.models import Libraries, SystemType
+from ...interfaces import DataManipulationBaseInterface
+from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import (
+    Libraries,
+    SystemType,
+)
 
 
-class KSigmaAnomalyDetection(WranglerBaseInterface):
+class KSigmaAnomalyDetection(DataManipulationBaseInterface):
     """
     Anomaly detection with the k-sigma method. This method either computes the mean and standard deviation, or the median and the median absolute deviation (MAD) of the data. The k-sigma method then filters out all data points that are k times the standard deviation away from the mean, or k times the MAD away from the median.
@@ -27,7 +30,7 @@ class KSigmaAnomalyDetection(WranglerBaseInterface):
     Example
     --------
     ```python
-    from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.k_sigma_anomaly_detection import KSigmaAnomalyDetection
+    from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.k_sigma_anomaly_detection import KSigmaAnomalyDetection
 
     spark = ...  # SparkSession
     df = ...  # Get a PySpark DataFrame
diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/missing_value_imputation.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/missing_value_imputation.py
similarity index 97%
rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/missing_value_imputation.py
rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/missing_value_imputation.py
index d784787cc..081068b41 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/missing_value_imputation.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/missing_value_imputation.py
@@ -20,11 +20,14 @@
 import numpy as np
 from datetime import timedelta
 from typing import List
-from ...interfaces import WranglerBaseInterface
-from ...._pipeline_utils.models import Libraries, SystemType
+from ...interfaces import DataManipulationBaseInterface
+from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import (
+    Libraries,
+    SystemType,
+)
 
 
-class MissingValueImputation(WranglerBaseInterface):
+class MissingValueImputation(DataManipulationBaseInterface):
     """
     Imputes missing values in a univariate time series creating a continuous curve of data points. For that, the time
     intervals of each individual source is calculated, to then insert empty records at the missing timestamps with
@@ -36,7 +39,7 @@ class MissingValueImputation(WranglerBaseInterface):
     from pyspark.sql import SparkSession
     from pyspark.sql.dataframe import DataFrame
     from pyspark.sql.types import StructType, StructField, StringType
-    from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.missing_value_imputation import (
+    from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.missing_value_imputation import (
         MissingValueImputation,
     )
diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/__init__.py
similarity index 95%
rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py
rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/__init__.py
index 5305a429e..74ed9fc2f 100644
--- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2022 RTDIP
+# Copyright 2024 RTDIP
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
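With the moves above, the cleansing components live under `rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality`. A usage sketch for `DuplicateDetection` under the new layout — the call pattern (construct with a DataFrame, then `filter()`) is taken from the tests added further down in this diff; treat any other constructor arguments as out of scope here:

```python
# Usage sketch for DuplicateDetection under the new package layout. The import
# path follows the renames above; the data shape mirrors the new test file in
# this same PR.
from pyspark.sql import SparkSession

from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.duplicate_detection import (
    DuplicateDetection,
)

spark = SparkSession.builder.master("local[2]").appName("dedupe").getOrCreate()
df = spark.createDataFrame(
    [
        ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"),
        ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"),
        ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"),
    ],
    ["TagName", "EventTime", "Status", "Value"],
)

deduped = DuplicateDetection(df).filter()  # the exact duplicate row is removed
deduped.show()
```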
diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/denormalization.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/denormalization.py similarity index 88% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/denormalization.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/denormalization.py index 06b0a509b..e63bb6b03 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/denormalization.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/denormalization.py @@ -12,14 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. from pyspark.sql import DataFrame as PySparkDataFrame -from .....data_wranglers.interfaces import WranglerBaseInterface -from ....._pipeline_utils.models import Libraries, SystemType +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.interfaces import ( + DataManipulationBaseInterface, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) from .normalization import ( NormalizationBaseClass, ) -class Denormalization(WranglerBaseInterface): +class Denormalization(DataManipulationBaseInterface): """ #TODO Applies the appropriate denormalization method to revert values to their original scale. diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization.py similarity index 93% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization.py index cb61d444f..fbe94b0ba 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization.py @@ -14,11 +14,16 @@ from abc import abstractmethod from pyspark.sql import DataFrame as PySparkDataFrame from typing import List -from .....data_wranglers.interfaces import WranglerBaseInterface -from ....._pipeline_utils.models import Libraries, SystemType +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.interfaces import ( + DataManipulationBaseInterface, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) -class NormalizationBaseClass(WranglerBaseInterface): +class NormalizationBaseClass(DataManipulationBaseInterface): """ A base class for applying normalization techniques to multiple columns in a PySpark DataFrame. 
This class serves as a framework to support various normalization methods (e.g., Z-Score, Min-Max, and Mean), diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_mean.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_mean.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_mean.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_mean.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_minmax.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_minmax.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_minmax.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_minmax.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_zscore.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_zscore.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_zscore.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_zscore.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/__init__.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/__init__.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/arima.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/arima.py similarity index 98% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/arima.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/arima.py index 6a9d55776..793bea749 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/arima.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/arima.py @@ -21,11 +21,14 @@ from statsmodels.tsa.arima.model import ARIMA, ARIMAResults from pmdarima import auto_arima -from ....interfaces import WranglerBaseInterface -from ....._pipeline_utils.models import Libraries, SystemType +from ....interfaces import DataManipulationBaseInterface +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) -class ArimaPrediction(WranglerBaseInterface): +class ArimaPrediction(DataManipulationBaseInterface): """ Extends the timeseries data in given DataFrame with forecasted values from an ARIMA model. 
Can be optionally set to use auto_arima, which operates a bit like a grid search, in that it tries various sets of p and q (also P and Q diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/interfaces.py similarity index 89% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/interfaces.py index bffdc4cb6..05a0df443 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/interfaces.py @@ -15,7 +15,8 @@ from abc import abstractmethod from pyspark.sql import DataFrame -from ..interfaces import PipelineComponentBaseInterface + +from src.sdk.python.rtdip_sdk.pipelines.interfaces import PipelineComponentBaseInterface class MonitoringBaseInterface(PipelineComponentBaseInterface): diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/check_value_ranges.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/check_value_ranges.py similarity index 97% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/check_value_ranges.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/check_value_ranges.py index 8bf6f7d6d..632a04caa 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/check_value_ranges.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/check_value_ranges.py @@ -4,8 +4,13 @@ from functools import reduce from operator import or_ -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.interfaces import ( + MonitoringBaseInterface, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) class CheckValueRanges(MonitoringBaseInterface): diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/flatline_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/flatline_detection.py similarity index 94% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/flatline_detection.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/flatline_detection.py index 64cf4c0f9..17b35100e 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/flatline_detection.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/flatline_detection.py @@ -17,8 +17,13 @@ from pyspark.sql.functions import col, when, lag, count, sum from pyspark.sql.window import Window -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.interfaces import ( + MonitoringBaseInterface, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) class 
FlatlineDetection(MonitoringBaseInterface):
@@ -35,7 +40,7 @@ class FlatlineDetection(MonitoringBaseInterface):
 
     Example:
     ```python
-    from rtdip_sdk.pipelines.monitoring.spark.data_quality.flatline_detection import FlatlineDetection
+    from rtdip_sdk.pipelines.data_quality.monitoring.spark.flatline_detection import FlatlineDetection
     from pyspark.sql import SparkSession
 
     spark = SparkSession.builder.master("local[1]").appName("FlatlineDetectionExample").getOrCreate()
diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/great_expectations_data_quality.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/great_expectations_data_quality.py
similarity index 95%
rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/great_expectations_data_quality.py
rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/great_expectations_data_quality.py
index f8022e41c..31e6e9be2 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/great_expectations_data_quality.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/great_expectations_data_quality.py
@@ -14,8 +14,13 @@
 import great_expectations as gx
 from pyspark.sql import DataFrame, SparkSession
 
-from ...interfaces import MonitoringBaseInterface
-from ...._pipeline_utils.models import Libraries, SystemType
+from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.interfaces import (
+    MonitoringBaseInterface,
+)
+from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import (
+    Libraries,
+    SystemType,
+)
 from great_expectations.checkpoint import (
     Checkpoint,
 )
@@ -32,7 +37,7 @@ class GreatExpectationsDataQuality(MonitoringBaseInterface):
     Example
     --------
     ```python
-    from src.sdk.python.rtdip_sdk.monitoring.data_quality.great_expectations.python.great_expectations_data_quality import GreatExpectationsDataQuality
+    from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.great_expectations_data_quality import GreatExpectationsDataQuality
     from rtdip_sdk.pipelines.utilities import SparkSessionUtility
     import json
diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_interval.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.py
similarity index 83%
rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_interval.py
rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.py
index f7f63ee8c..6b3a8dd19 100644
--- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_interval.py
+++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.py
@@ -12,15 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
+
 from pyspark.sql import DataFrame as PySparkDataFrame
 from pyspark.sql import functions as F
 from pyspark.sql.window import Window
 
-from ...interfaces import MonitoringBaseInterface
-from ...._pipeline_utils.models import Libraries, SystemType
-from ....utilities.spark.time_string_parsing import parse_time_string_to_ms
-
-from src.sdk.python.rtdip_sdk.pipelines.logging.logger_manager import LoggerManager
+from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.interfaces import (
+    MonitoringBaseInterface,
+)
+from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import (
+    Libraries,
+    SystemType,
+)
+from src.sdk.python.rtdip_sdk.pipelines.utilities.spark.time_string_parsing import (
+    parse_time_string_to_ms,
+)
 
 
 class IdentifyMissingDataInterval(MonitoringBaseInterface):
@@ -37,19 +44,22 @@ class IdentifyMissingDataInterval(MonitoringBaseInterface):
         mad_multiplier (float, optional): Multiplier for MAD to calculate tolerance. Default is 3.
         min_tolerance (str, optional): Minimum tolerance for pattern-based detection (e.g., '100ms'). Default is '10ms'.
 
-    Example:
-    ```python
-    from rtdip_sdk.pipelines.monitoring.spark.data_quality import IdentifyMissingDataInterval
-    from pyspark.sql import SparkSession
+    Returns:
+        df (pyspark.sql.DataFrame): Returns the original PySpark DataFrame without changes.
 
-    missing_data_monitor = IdentifyMissingDataInterval(
-        df=df,
-        interval='100ms',
-        tolerance='10ms',
-    )
+    Example
+    --------
+    ```python
+    from rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval import IdentifyMissingDataInterval
+    from pyspark.sql import SparkSession
 
-    df_result = missing_data_monitor.check()
-    ```
+    missing_data_monitor = IdentifyMissingDataInterval(
+        df=df,
+        interval='100ms',
+        tolerance='10ms',
+    )
+
+    df_result = missing_data_monitor.check()
+    ```
     """
@@ -70,10 +80,17 @@ def __init__(
         self.mad_multiplier = mad_multiplier
         self.min_tolerance = min_tolerance
 
-        # Use global pipeline logger
-
-        self.logger_manager = LoggerManager()
-        self.logger = self.logger_manager.create_logger("IdentifyMissingDataInterval")
+        # Configure logging
+        self.logger = logging.getLogger(self.__class__.__name__)
+        if not self.logger.handlers:
+            # Prevent adding multiple handlers in interactive environments
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter(
+                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+            )
+            handler.setFormatter(formatter)
+            self.logger.addHandler(handler)
+            self.logger.setLevel(logging.INFO)
 
     @staticmethod
     def system_type():
@@ -93,13 +110,6 @@ def settings() -> dict:
         return {}
 
     def check(self) -> PySparkDataFrame:
-        """
-        Executes the identify missing data logic.
-
-        Returns:
-            pyspark.sql.DataFrame:
-                Returns the original PySpark DataFrame without changes.
- """ if "EventTime" not in self.df.columns: self.logger.error("The DataFrame must contain an 'EventTime' column.") raise ValueError("The DataFrame must contain an 'EventTime' column.") diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_pattern.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.py similarity index 92% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_pattern.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.py index dd81ae579..97bb84107 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_pattern.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.py @@ -4,10 +4,16 @@ from pyspark.sql import DataFrame as PySparkDataFrame from pyspark.sql import functions as F -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType -from ....logging.logger_manager import LoggerManager -from ....utilities.spark.time_string_parsing import parse_time_string_to_ms +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.interfaces import ( + MonitoringBaseInterface, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) +from src.sdk.python.rtdip_sdk.pipelines.utilities.spark.time_string_parsing import ( + parse_time_string_to_ms, +) class IdentifyMissingDataPattern(MonitoringBaseInterface): @@ -28,30 +34,8 @@ class IdentifyMissingDataPattern(MonitoringBaseInterface): tolerance (str, optional): Maximum allowed deviation from the pattern (e.g., '1s', '500ms'). Default is '10ms'. - Example: - ```python - from pyspark.sql import SparkSession - - spark = SparkSession.builder.master("local[1]").appName("IdentifyMissingDataPatternExample").getOrCreate() - - patterns = [ - {"second": 0}, - {"second": 20}, - ] - - frequency = "minutely" - tolerance = "1s" - - identify_missing_data = IdentifyMissingDataPattern( - df=df, - patterns=patterns, - frequency=frequency, - tolerance=tolerance, - ) - - identify_missing_data.check() - ``` - + Returns: + PySparkDataFrame: Returns the original PySpark DataFrame without changes. """ df: PySparkDataFrame @@ -70,8 +54,16 @@ def __init__( self.tolerance = tolerance # Configure logging - - self.logger = LoggerManager().create_logger(self.__class__.__name__) + self.logger = logging.getLogger(self.__class__.__name__) + if not self.logger.handlers: + # Prevent adding multiple handlers in interactive environments + handler = logging.StreamHandler() + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + handler.setFormatter(formatter) + self.logger.addHandler(handler) + self.logger.setLevel(logging.INFO) @staticmethod def system_type(): diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py deleted file mode 100644 index 24cb1a77a..000000000 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from .spark.data_quality.duplicate_detection import * -from .spark.data_quality.normalization.normalization import * -from .spark.data_quality.normalization.normalization_mean import * -from .spark.data_quality.normalization.normalization_minmax import * -from .spark.data_quality.normalization.normalization_zscore import * -from .spark.data_quality.normalization.denormalization import * -from .spark.data_quality.prediction.arima import * -from .spark.data_quality.missing_value_imputation import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py deleted file mode 100644 index 4e3c1f154..000000000 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from .spark.data_quality.great_expectations_data_quality import * -from .spark.data_quality.identify_missing_data_interval import * -from .spark.data_quality.identify_missing_data_pattern import * -from .spark.data_quality.flatline_detection import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/__init__.py index 60396d776..5178ffd5e 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/__init__.py @@ -1,2 +1,16 @@ -from .cols_to_vector import * +# Copyright 2024 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .columns_to_vector import * from .polynomial_features import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/cols_to_vector.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/columns_to_vector.py similarity index 98% rename from src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/cols_to_vector.py rename to src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/columns_to_vector.py index 369a87cfe..e9da1241c 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/cols_to_vector.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/columns_to_vector.py @@ -18,7 +18,7 @@ from ...interfaces import TransformerInterface -class ColsToVector(TransformerInterface): +class ColumnsToVector(TransformerInterface): """ Converts columns containing numbers to a column containing a vector. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py similarity index 95% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py index 5305a429e..74ed9fc2f 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2024 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py similarity index 95% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py index 5305a429e..74ed9fc2f 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2024 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_arima.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_arima.py similarity index 99% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_arima.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_arima.py index 2f2d4b94a..c75137c47 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_arima.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_arima.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from abc import ABCMeta import numpy as np import pandas as pd @@ -21,7 +20,9 @@ from pyspark.sql.dataframe import DataFrame from pyspark.sql.types import StructType, StructField, StringType -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers import ArimaPrediction +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.prediction.arima import ( + ArimaPrediction, +) @pytest.fixture(scope="session") diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_duplicate_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_duplicate_detection.py new file mode 100644 index 000000000..901166612 --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_duplicate_detection.py @@ -0,0 +1,62 @@ +# Copyright 2022 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest + +from pyspark.sql import SparkSession +from pyspark.sql.dataframe import DataFrame + +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.duplicate_detection import ( + DuplicateDetection, +) + + +@pytest.fixture(scope="session") +def spark_session(): + return SparkSession.builder.master("local[2]").appName("test").getOrCreate() + + +def test_duplicate_detection(spark_session: SparkSession): + expected_df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + df = spark_session.createDataFrame( + [ + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", "0.340000004"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", "0.150000006"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", "0.129999995"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", "0.119999997"), + ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", "0.129999995"), + ], + ["TagName", "EventTime", "Status", "Value"], + ) + + duplicate_detection_monitor = DuplicateDetection(df) + actual_df = duplicate_detection_monitor.filter() + + assert isinstance(actual_df, DataFrame) + + assert expected_df.columns == actual_df.columns + assert expected_df.schema == actual_df.schema + assert expected_df.collect() == actual_df.collect() diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_interval_filtering.py 
b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_interval_filtering.py similarity index 99% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_interval_filtering.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_interval_filtering.py index 472df9f3a..91f35a7b7 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_interval_filtering.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_interval_filtering.py @@ -16,7 +16,7 @@ import pytest from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.interval_filtering import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.interval_filtering import ( IntervalFiltering, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_k_sigma_anomaly_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_k_sigma_anomaly_detection.py similarity index 96% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_k_sigma_anomaly_detection.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_k_sigma_anomaly_detection.py index 9474645a9..b7a5ab860 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_k_sigma_anomaly_detection.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_k_sigma_anomaly_detection.py @@ -1,6 +1,6 @@ from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.k_sigma_anomaly_detection import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.k_sigma_anomaly_detection import ( KSigmaAnomalyDetection, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_missing_value_imputation.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_missing_value_imputation.py similarity index 99% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_missing_value_imputation.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_missing_value_imputation.py index b45bb5e41..f9517a4a3 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_missing_value_imputation.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_missing_value_imputation.py @@ -18,7 +18,7 @@ from pyspark.sql.functions import col, unix_timestamp, abs as A from pyspark.sql.types import StructType, StructField, StringType -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.missing_value_imputation import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.missing_value_imputation import ( MissingValueImputation, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_normalization.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_normalization.py similarity index 88% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_normalization.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_normalization.py index 
6d6493ff7..a994792f9 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_normalization.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_normalization.py @@ -11,19 +11,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from abc import ABCMeta import pytest from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers import ( - NormalizationBaseClass, +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.normalization.denormalization import ( Denormalization, - NormalizationMinMax, +) +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.normalization.normalization import ( + NormalizationBaseClass, +) +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.normalization.normalization_mean import ( NormalizationMean, ) +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.normalization.normalization_minmax import ( + NormalizationMinMax, +) @pytest.fixture(scope="session") diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/__init__ .py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__ .py similarity index 95% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/__init__ .py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__ .py index 5305a429e..74ed9fc2f 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/__init__ .py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__ .py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2024 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py new file mode 100644 index 000000000..74ed9fc2f --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
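Both monitors in this PR now configure a per-class logger inline instead of going through `LoggerManager`. The pattern names the logger after the class and guards against attaching duplicate handlers when a monitor is instantiated repeatedly, e.g. in a notebook. A self-contained sketch of that pattern (the `ExampleMonitor` class is hypothetical):

```python
# Self-contained sketch of the inline logging setup the monitors now use:
# a logger named after the class, with a handler guard so repeated
# instantiation does not produce duplicate log lines.
import logging


class ExampleMonitor:  # hypothetical, for illustration only
    def __init__(self) -> None:
        self.logger = logging.getLogger(self.__class__.__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(
                logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
            )
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)


ExampleMonitor().logger.info("handler attached once")
ExampleMonitor().logger.info("re-instantiation reuses the same handler")
```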
diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_check_value_ranges.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_check_value_ranges.py similarity index 98% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_check_value_ranges.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_check_value_ranges.py index caf67f08f..f14690861 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_check_value_ranges.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_check_value_ranges.py @@ -3,7 +3,7 @@ from io import StringIO import logging -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.check_value_ranges import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.check_value_ranges import ( CheckValueRanges, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_flatline_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_flatline_detection.py similarity index 97% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_flatline_detection.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_flatline_detection.py index 37d097baa..ed5eb688c 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_flatline_detection.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_flatline_detection.py @@ -1,6 +1,6 @@ import pytest from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.flatline_detection import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.flatline_detection import ( FlatlineDetection, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_great_expectations_data_quality.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_great_expectations_data_quality.py similarity index 96% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_great_expectations_data_quality.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_great_expectations_data_quality.py index 00bb57902..51584ba4b 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_great_expectations_data_quality.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_great_expectations_data_quality.py @@ -1,8 +1,7 @@ -import pytest from pytest_mock import MockerFixture -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.great_expectations_data_quality import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.great_expectations_data_quality import ( GreatExpectationsDataQuality, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_interval.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_interval.py similarity index 95% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_interval.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_interval.py index 
858dfae12..26fc2a375 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_interval.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_interval.py @@ -1,8 +1,6 @@ import pytest from pyspark.sql import SparkSession - -from src.sdk.python.rtdip_sdk.pipelines.logging.logger_manager import LoggerManager -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.identify_missing_data_interval import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval import ( IdentifyMissingDataInterval, ) @@ -24,9 +22,8 @@ def spark(): @pytest.fixture def log_capture(): log_stream = StringIO() - logger_manager = LoggerManager() - logger = logger_manager.create_logger("IdentifyMissingDataInterval") - + logger = logging.getLogger("IdentifyMissingDataInterval") + logger.setLevel(logging.INFO) handler = logging.StreamHandler(log_stream) formatter = logging.Formatter("%(message)s") handler.setFormatter(formatter) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_pattern.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_pattern.py similarity index 98% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_pattern.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_pattern.py index 2a3c39bc1..fa170dafc 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_pattern.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_pattern.py @@ -8,7 +8,7 @@ from pyspark.sql.types import StructType, StructField, StringType -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.identify_missing_data_pattern import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_pattern import ( IdentifyMissingDataPattern, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py deleted file mode 100644 index 5305a429e..000000000 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
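The rewritten `log_capture` fixture above asserts on log output by pointing a `StreamHandler` at a `StringIO` buffer instead of relying on `LoggerManager`. Stripped of the pytest plumbing, the mechanism looks like this (a sketch, not the fixture itself; the logged message is illustrative):

```python
# Sketch of the log-capture mechanism behind the rewritten fixture: route a
# named logger into a StringIO buffer so tests can assert on emitted messages.
import logging
from io import StringIO

log_stream = StringIO()
logger = logging.getLogger("IdentifyMissingDataInterval")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(log_stream)
handler.setFormatter(logging.Formatter("%(message)s"))
logger.addHandler(handler)

logger.info("Missing interval detected")
assert "Missing interval detected" in log_stream.getvalue()

logger.removeHandler(handler)  # teardown, as the fixture would do
```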
diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_duplicate_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_duplicate_detection.py deleted file mode 100644 index 7ce73fb42..000000000 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_duplicate_detection.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.duplicate_detection import ( - DuplicateDetection, -) - - -@pytest.fixture(scope="session") -def spark_session(): - return SparkSession.builder.master("local[2]").appName("test").getOrCreate() - - -@pytest.fixture -def test_data(spark_session): - data = [ - ("key1", "time1", "value1"), - ("key2", "time2", "value2"), - ("key2", "time3", "value2"), - ("key1", "time1", "value3"), - ("key4", "time4", "value4"), - ("key5", "time4", "value5"), - ] - columns = ["TagName", "EventTime", "Value"] - return spark_session.createDataFrame(data, columns) - - -def test_duplicate_detection_two_columns(spark_session, test_data): - expected_data = [ - ("key1", "time1", "value1"), - ("key2", "time2", "value2"), - ("key2", "time3", "value2"), - ("key4", "time4", "value4"), - ("key5", "time4", "value5"), - ] - columns = ["TagName", "EventTime", "Value"] - expected_df = spark_session.createDataFrame(expected_data, columns) - - duplicate_detection = DuplicateDetection( - test_data, primary_key_columns=["TagName", "EventTime"] - ) - result_df = duplicate_detection.filter() - result_df.show() - - assert ( - result_df.count() == expected_df.count() - ), "Row count does not match expected result" - assert sorted(result_df.collect()) == sorted( - expected_df.collect() - ), "Data does not match expected result" - - -def test_duplicate_detection_one_column(spark_session, test_data): - expected_data = [ - ("key1", "time1", "value1"), - ("key2", "time2", "value2"), - ("key4", "time4", "value4"), - ("key5", "time4", "value5"), - ] - columns = ["TagName", "EventTime", "Value"] - expected_df = spark_session.createDataFrame(expected_data, columns) - - duplicate_detection = DuplicateDetection(test_data, primary_key_columns=["TagName"]) - result_df = duplicate_detection.filter() - result_df.show() - - assert ( - result_df.count() == expected_df.count() - ), "Row count does not match expected result" - assert sorted(result_df.collect()) == sorted( - expected_df.collect() - ), "Data does not match expected result" diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_one_hot_encoding.py b/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_one_hot_encoding.py deleted file mode 100644 index a1b6d035a..000000000 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_one_hot_encoding.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed 
under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import pytest - -from pyspark.sql import SparkSession -from pyspark.sql.types import StructType, StructField, StringType, FloatType -from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.machine_learning.one_hot_encoding import ( - OneHotEncoding, -) - -# Define the schema outside the test functions -SCHEMA = StructType( - [ - StructField("TagName", StringType(), True), - StructField("EventTime", StringType(), True), - StructField("Status", StringType(), True), - StructField("Value", FloatType(), True), - ] -) - - -@pytest.fixture(scope="session") -def spark_session(): - return SparkSession.builder.master("local[2]").appName("test").getOrCreate() - - -def test_empty_df(spark_session): - """Empty DataFrame""" - empty_data = [] - empty_df = spark_session.createDataFrame(empty_data, SCHEMA) - encoder = OneHotEncoding(empty_df, "TagName") - result_df = encoder.transform() - - assert ( - result_df.count() == 0 - ), "Expected no rows in the result DataFrame for empty input." - assert result_df.columns == [ - "TagName", - "EventTime", - "Status", - "Value", - ], "Expected no new columns for empty DataFrame." - - -def test_single_unique_value(spark_session): - """Single Unique Value""" - data = [ - ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46", "Good", 0.34), - ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12", "Good", 0.15), - ] - df = spark_session.createDataFrame(data, SCHEMA) - encoder = OneHotEncoding(df, "TagName") - result_df = encoder.transform() - - expected_columns = [ - "TagName", - "EventTime", - "Status", - "Value", - "TagName_A2PS64V0J.:ZUX09R", - ] - assert ( - result_df.columns == expected_columns - ), "Columns do not match for single unique value." - for row in result_df.collect(): - assert ( - row["TagName_A2PS64V0J.:ZUX09R"] == 1 - ), "Expected 1 for the one-hot encoded column." - - -def test_null_values(spark_session): - """Column with Null Values""" - data = [ - ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46", "Good", 0.34), - (None, "2024-01-02 16:00:12", "Good", 0.15), - ] - df = spark_session.createDataFrame(data, SCHEMA) - encoder = OneHotEncoding(df, "TagName") - result_df = encoder.transform() - - expected_columns = [ - "TagName", - "EventTime", - "Status", - "Value", - "TagName_A2PS64V0J.:ZUX09R", - "TagName_None", - ] - assert ( - result_df.columns == expected_columns - ), f"Columns do not match for null value case. Expected {expected_columns}, but got {result_df.columns}" - for row in result_df.collect(): - if row["TagName"] == "A2PS64V0J.:ZUX09R": - assert ( - row["TagName_A2PS64V0J.:ZUX09R"] == 1 - ), "Expected 1 for valid TagName." - assert ( - row["TagName_None"] == 0 - ), "Expected 0 for TagName_None for valid TagName." - elif row["TagName"] is None: - assert ( - row["TagName_A2PS64V0J.:ZUX09R"] == 0 - ), "Expected 0 for TagName_A2PS64V0J.:ZUX09R for None TagName." - assert ( - row["TagName_None"] == 0 - ), "Expected 0 for TagName_None for None TagName." 
- - -def test_large_unique_values(spark_session): - """Large Number of Unique Values""" - data = [ - (f"Tag_{i}", f"2024-01-02 20:03:{i:02d}", "Good", i * 1.0) for i in range(1000) - ] - df = spark_session.createDataFrame(data, SCHEMA) - encoder = OneHotEncoding(df, "TagName") - result_df = encoder.transform() - - assert ( - len(result_df.columns) == len(SCHEMA.fields) + 1000 - ), "Expected 1000 additional columns for one-hot encoding." - - -def test_special_characters(spark_session): - """Special Characters in Column Values""" - data = [ - ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46", "Good", 0.34), - ("@Special#Tag!", "2024-01-02 16:00:12", "Good", 0.15), - ] - df = spark_session.createDataFrame(data, SCHEMA) - encoder = OneHotEncoding(df, "TagName") - result_df = encoder.transform() - - expected_columns = [ - "TagName", - "EventTime", - "Status", - "Value", - "TagName_A2PS64V0J.:ZUX09R", - "TagName_@Special#Tag!", - ] - assert ( - result_df.columns == expected_columns - ), "Columns do not match for special characters." - for row in result_df.collect(): - for tag in ["A2PS64V0J.:ZUX09R", "@Special#Tag!"]: - expected_value = 1 if row["TagName"] == tag else 0 - column_name = f"TagName_{tag}" - assert ( - row[column_name] == expected_value - ), f"Expected {expected_value} for {column_name}." - - -def test_distinct_value(spark_session): - """Dataset with Multiple TagName Values""" - - data = [ - ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46", "Good", 0.3400000035762787), - ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12", "Good", 0.15000000596046448), - ( - "-4O7LSSAM_3EA02:2GT7E02I_R_MP", - "2024-01-02 20:09:58", - "Good", - 7107.82080078125, - ), - ("_LT2EPL-9PM0.OROTENV3:", "2024-01-02 12:27:10", "Good", 19407.0), - ("1N325T3MTOR-P0L29:9.T0", "2024-01-02 23:41:10", "Good", 19376.0), - ] - - df = spark_session.createDataFrame(data, SCHEMA) - - encoder = OneHotEncoding(df, "TagName") - result_df = encoder.transform() - - result = result_df.collect() - - expected_columns = df.columns + [ - f"TagName_{row['TagName']}" for row in df.select("TagName").distinct().collect() - ] - - assert set(result_df.columns) == set(expected_columns) - - tag_names = df.select("TagName").distinct().collect() - for row in result: - tag_name = row["TagName"] - for tag in tag_names: - column_name = f"TagName_{tag['TagName']}" - if tag["TagName"] == tag_name: - assert row[column_name] == 1.0 - else: - assert row[column_name] == 0.0 diff --git a/tests/sdk/python/rtdip_sdk/pipelines/machine_learning/spark/test_linear_regression.py b/tests/sdk/python/rtdip_sdk/pipelines/machine_learning/spark/test_linear_regression.py index 493426d9e..3bae9691c 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/machine_learning/spark/test_linear_regression.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/machine_learning/spark/test_linear_regression.py @@ -25,8 +25,8 @@ from src.sdk.python.rtdip_sdk.pipelines.machine_learning.spark.linear_regression import ( LinearRegression, ) -from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.machine_learning.cols_to_vector import ( - ColsToVector, +from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.machine_learning.columns_to_vector import ( + ColumnsToVector, ) from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.machine_learning.polynomial_features import ( PolynomialFeatures, @@ -170,8 +170,8 @@ def sample_data(spark): # Test cases def test_cols_to_vector(sample_data): df = sample_data - # Pass the DataFrame to ColsToVector - cols_to_vector = ColsToVector(df=df, input_cols=["Value"], 
output_col="features") + # Pass the DataFrame to ColumnsToVector + cols_to_vector = ColumnsToVector(df=df, input_cols=["Value"], output_col="features") transformed_df = cols_to_vector.transform() assert "features" in transformed_df.columns @@ -180,8 +180,8 @@ def test_cols_to_vector(sample_data): def test_polynomial_features(sample_data): df = sample_data - # Convert 'Value' to a vector using ColsToVector - cols_to_vector = ColsToVector(df=df, input_cols=["Value"], output_col="features") + # Convert 'Value' to a vector using ColumnsToVector + cols_to_vector = ColumnsToVector(df=df, input_cols=["Value"], output_col="features") vectorized_df = cols_to_vector.transform() polynomial_features = PolynomialFeatures( @@ -197,8 +197,8 @@ def test_polynomial_features(sample_data): def test_linear_regression(sample_data): df = sample_data - # Use ColsToVector to assemble features into a single vector column - cols_to_vector = ColsToVector(df=df, input_cols=["Value"], output_col="features") + # Use ColumnsToVector to assemble features into a single vector column + cols_to_vector = ColumnsToVector(df=df, input_cols=["Value"], output_col="features") df = cols_to_vector.transform() linear_regression = LinearRegression( df, features_col="features", label_col="Value", prediction_col="prediction" diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py deleted file mode 100644 index 5305a429e..000000000 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py deleted file mode 100644 index 5305a429e..000000000 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
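Note on the removed test_duplicate_detection.py: the deleted assertions pin down the wrangler's contract, namely that DuplicateDetection(df, primary_key_columns=[...]).filter() keeps exactly one row per primary key. A minimal PySpark sketch of that contract follows; using dropDuplicates as a stand-in is an assumption, since the component's implementation is not part of this diff.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").appName("sketch").getOrCreate()

df = spark.createDataFrame(
    [
        ("key1", "time1", "value1"),
        ("key1", "time1", "value3"),  # duplicate on (TagName, EventTime)
        ("key2", "time2", "value2"),
    ],
    ["TagName", "EventTime", "Value"],
)

# The deleted two-column test expects one surviving row per
# (TagName, EventTime) pair; dropDuplicates yields the same row counts.
deduped = df.dropDuplicates(["TagName", "EventTime"])
assert deduped.count() == 2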
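Similarly, the removed test_one_hot_encoding.py documents the transformer's expected output: one indicator column per distinct TagName, named "TagName_<value>", with rows whose tag is None scoring 0 in every indicator column. Below is a sketch of that behavior in plain PySpark, continuing the session from the previous sketch; the real OneHotEncoding transformer may differ internally.

from pyspark.sql import functions as F

df = spark.createDataFrame(
    [
        ("tagA", "2024-01-02 20:03:46", "Good", 0.34),
        (None, "2024-01-02 16:00:12", "Good", 0.15),
    ],
    ["TagName", "EventTime", "Status", "Value"],
)

# A null comparison is never true, so None-tagged rows fall through to
# otherwise(0) in every indicator column, matching the old test_null_values.
distinct_tags = [r["TagName"] for r in df.select("TagName").distinct().collect()]
encoded = df
for tag in distinct_tags:
    encoded = encoded.withColumn(
        f"TagName_{tag}",
        F.when(F.col("TagName") == tag, 1).otherwise(0),
    )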
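Finally, the only non-deletion hunks in this section rename ColsToVector to ColumnsToVector (module cols_to_vector becomes columns_to_vector); the constructor signature is unchanged. For callers migrating, the two constructor calls below are taken verbatim from the updated test_linear_regression.py, while the surrounding glue is only a sketch reusing df from above.

from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.machine_learning.columns_to_vector import (
    ColumnsToVector,
)
from src.sdk.python.rtdip_sdk.pipelines.machine_learning.spark.linear_regression import (
    LinearRegression,
)

# Old import (removed): ...transformers.spark.machine_learning.cols_to_vector
# Only the class and module names changed; arguments are identical.
df = ColumnsToVector(df=df, input_cols=["Value"], output_col="features").transform()
model = LinearRegression(
    df, features_col="features", label_col="Value", prediction_col="prediction"
)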