diff --git a/.github/ISSUE_TEMPLATE/amos_feature_request.md b/.github/ISSUE_TEMPLATE/amos_feature_request.md deleted file mode 100644 index 656f4751d..000000000 --- a/.github/ISSUE_TEMPLATE/amos_feature_request.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -name: AMOS Feature request -about: A feature request following the AMOS project guidelines -title: '' -labels: '' -assignees: '' - ---- -## User Story - -1. As a ... (Who?) -2. I want to ... (What?) -3. So that ... (Why?) - -## Additional context - -Add any other context or screenshots about the feature request here. - -## Acceptance Criteria - -- **Add acceptance criteria here** - -## Definition of Done - -- Test cases have been created and are runnin successfully -- Documentation for the new component was added -- Github Actions are running without errors - diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index 82eab7a09..cc272246e 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -25,8 +25,6 @@ jobs: uses: rtdip/core/.github/workflows/test.yml@develop job_run_unit_tests_and_sonarqube: - # TODO: Remove in Merge to main project - # uses: amosproj/amos2024ws01-rtdip-data-quality-checker/.github/workflows/sonarcloud_reusable.yml@hotfix/pipeline-moto uses: rtdip/core/.github/workflows/sonarcloud_reusable.yml@develop with: REPO_NAME: "" diff --git a/.gitignore b/.gitignore index 94cd7d594..d7f045ff8 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,10 @@ __pycache__/ *.py[cod] *$py.class +# AMOS project specific folders +Deliverables/ +amos_feature_request.md + # C extensions *.so diff --git a/Deliverables/sprint-01/.gitkeep b/Deliverables/sprint-01/.gitkeep deleted file mode 100644 index 8b1378917..000000000 --- a/Deliverables/sprint-01/.gitkeep +++ /dev/null @@ -1 +0,0 @@ - diff --git a/Deliverables/sprint-01/feature-board.png b/Deliverables/sprint-01/feature-board.png deleted file mode 100644 index dbe7c6102..000000000 Binary files 
a/Deliverables/sprint-01/feature-board.png and /dev/null differ diff --git a/Deliverables/sprint-01/feature-board.tsv b/Deliverables/sprint-01/feature-board.tsv deleted file mode 100644 index d8a9ae7f1..000000000 --- a/Deliverables/sprint-01/feature-board.tsv +++ /dev/null @@ -1,9 +0,0 @@ -Title URL Assignees Status Estimated size Real size -Please adopt the Deliverables folder structure from https://github.com/amosproj/amos202Xss0Y-projname to your repo / branch https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/7 Product Backlog -Set up a development environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/1 Product Backlog -[Component] Identify missing data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/2 Product Backlog -[Component] Outlier detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/3 Product Backlog -[Component] Duplicate detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/4 Product Backlog -[Component] Trend identification https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/5 Product Backlog -[Component] Anomaly detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/6 Product Backlog -Fix broken virtual environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/8 Product Backlog \ No newline at end of file diff --git a/Deliverables/sprint-01/imp-squared-backlog.jpg b/Deliverables/sprint-01/imp-squared-backlog.jpg deleted file mode 100644 index 538d9e7b7..000000000 Binary files a/Deliverables/sprint-01/imp-squared-backlog.jpg and /dev/null differ diff --git a/Deliverables/sprint-01/imp-squared-backlog.tsv b/Deliverables/sprint-01/imp-squared-backlog.tsv deleted file mode 100644 index 6d15b7eb2..000000000 --- a/Deliverables/sprint-01/imp-squared-backlog.tsv 
+++ /dev/null @@ -1 +0,0 @@ -Title Assignees Status \ No newline at end of file diff --git a/Deliverables/sprint-01/planning-documents.pdf b/Deliverables/sprint-01/planning-documents.pdf deleted file mode 100644 index 9460d0357..000000000 Binary files a/Deliverables/sprint-01/planning-documents.pdf and /dev/null differ diff --git a/Deliverables/sprint-01/team-logo.png b/Deliverables/sprint-01/team-logo.png deleted file mode 100644 index 9cb6d3820..000000000 Binary files a/Deliverables/sprint-01/team-logo.png and /dev/null differ diff --git a/Deliverables/sprint-01/team-logo.svg b/Deliverables/sprint-01/team-logo.svg deleted file mode 100644 index f19f860cd..000000000 --- a/Deliverables/sprint-01/team-logo.svg +++ /dev/null @@ -1,116 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/Deliverables/sprint-02/feature_board.jpg b/Deliverables/sprint-02/feature_board.jpg deleted file mode 100644 index 7a7c93045..000000000 Binary files a/Deliverables/sprint-02/feature_board.jpg and /dev/null differ diff --git a/Deliverables/sprint-02/feature_board.tsv b/Deliverables/sprint-02/feature_board.tsv deleted file mode 100644 index 4994bd6fe..000000000 --- a/Deliverables/sprint-02/feature_board.tsv +++ /dev/null @@ -1,18 +0,0 @@ -Title URL Assignees Status Estimated size Real size -[Component] Anomaly detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/6 FelipeTrost Awaiting Review -[Component] Duplicate detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/4 chris-1187, dh1542 Awaiting Review 8 -[sprint-02] Create software bill of materials https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/9 kristen149 Awaiting Review 1 -[Component] Identify missing data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/2 mollle In Progress -[sprint-02] Create software architecture diagram 
https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/10 dh1542, Timm638 In Progress 3 -Set up a development environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/1 In Progress -Explore the test data and brainstorm RTDIP component ideas https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/11 In Progress 5 -Please adopt the Deliverables folder structure from https://github.com/amosproj/amos202Xss0Y-projname to your repo / branch https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/7 Feature Archive -Fix broken virtual environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/8 dh1542, Timm638 Feature Archive 3 -Develop a test pipeline to run during release Product Backlog -Define clear acceptance criteria for components https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/16 Product Backlog -[Component] Outlier detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/3 FelipeTrost Product Backlog -[Component] Dimensionality Reduction https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/17 Product Backlog -[Component] Normalization of Data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/18 Product Backlog -[Component] Alternative Preprocessing Methods https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/19 Product Backlog -[Component] Trend Identification https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/20 Product Backlog -[Component] Data Format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/21 Product Backlog \ No newline at end of file diff --git a/Deliverables/sprint-02/imp-squared-backlog.jpg b/Deliverables/sprint-02/imp-squared-backlog.jpg deleted file mode 
100644 index 2b77d6890..000000000 Binary files a/Deliverables/sprint-02/imp-squared-backlog.jpg and /dev/null differ diff --git a/Deliverables/sprint-02/imp-squared-backlog.tsv b/Deliverables/sprint-02/imp-squared-backlog.tsv deleted file mode 100644 index 3fc247f1b..000000000 --- a/Deliverables/sprint-02/imp-squared-backlog.tsv +++ /dev/null @@ -1,6 +0,0 @@ -Title Assignees Status -Slack workspace (Avi) In Progress -Get to know expectations and requirements from Industry Partner In Progress -Homework no assigned clearly - now assigned In Progress -SD Meeting In Progress -Figure out pipeline bug - for everyone Todo \ No newline at end of file diff --git a/Deliverables/sprint-02/planning-documents.pdf b/Deliverables/sprint-02/planning-documents.pdf deleted file mode 100644 index 0b519b088..000000000 Binary files a/Deliverables/sprint-02/planning-documents.pdf and /dev/null differ diff --git a/Deliverables/sprint-02/rtdip-sdk-sbom.md b/Deliverables/sprint-02/rtdip-sdk-sbom.md deleted file mode 100644 index 11727ac14..000000000 --- a/Deliverables/sprint-02/rtdip-sdk-sbom.md +++ /dev/null @@ -1,76 +0,0 @@ -# Software Bill of Materials (SBOM) - -## Project Name: rtdip-sdk -## Version: [RTDIP_SDK_1] -## Date: [29.10.2024] -## License: Apache License, Version 2.0 - -### Overview -This SBOM lists all required and optional dependencies for the `rtdip-sdk` project, including their versions and licenses. - -### Components - - -| Field | Name | Version Range | Supplier | License | Comment | -|-------|---------------------------------|-----------------------|-----------------------------|--------------------|--------------------------| -| 1 | databricks-sql-connector | >=3.1.0,<4.0.0 | Databricks, Inc. 
| Apache 2.0 | SQL connector for Databricks | -| 2 | azure-identity | >=1.12.0,<2.0.0 | Microsoft | MIT | Identity management for Azure | -| 3 | pandas | >=1.5.2,<2.2.0 | The Pandas Development Team | BSD 3-Clause | Data manipulation library | -| 4 | jinja2 | >=3.1.4,<4.0.0 | Jinja2 Team | BSD 3-Clause | Template engine for Python | -| 5 | importlib_metadata | >=7.0.0 | PyPa | MIT | Metadata for Python packages | -| 6 | semver | >=3.0.0,<4.0.0 | Mikhail Korobeynikov | MIT | Semantic versioning library | -| 7 | xlrd | >=2.0.1,<3.0.0 | Python Software Foundation | MIT | Library for reading Excel files | -| 8 | grpcio | >=1.48.1 | Google LLC | Apache 2.0 | gRPC library for Python | -| 9 | grpcio-status | >=1.48.1 | Google LLC | Apache 2.0 | gRPC status library | -| 10 | googleapis-common-protos | >=1.56.4 | Google LLC | Apache 2.0 | Common protobufs for Google APIs | -| 11 | langchain | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Framework for LLMs | -| 12 | langchain-community | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Community contributions to LangChain | -| 13 | openai | >=1.13.3,<2.0.0 | OpenAI | MIT | OpenAI API client | -| 14 | pydantic | >=2.6.0,<3.0.0 | Samuel Colvin | MIT | Data validation library | -| 15 | pyspark | >=3.3.0,<3.6.0 | The Apache Software Foundation | Apache 2.0 | Spark library for Python | -| 16 | delta-spark | >=2.2.0,<3.3.0 | Databricks, Inc. | Apache 2.0 | Delta Lake integration with Spark | -| 17 | dependency-injector | >=4.41.0,<5.0.0 | Paul Ganssle | MIT | Dependency injection framework | -| 18 | databricks-sdk | >=0.20.0,<1.0.0 | Databricks, Inc. 
| Apache 2.0 | SDK for Databricks services | -| 19 | azure-storage-file-datalake | >=12.12.0,<13.0.0 | Microsoft | MIT | Azure Data Lake Storage client | -| 20 | azure-mgmt-storage | >=21.0.0 | Microsoft | MIT | Azure Storage management client | -| 21 | azure-mgmt-eventgrid | >=10.2.0 | Microsoft | MIT | Azure Event Grid management client | -| 22 | boto3 | >=1.28.2,<2.0.0 | Amazon Web Services | Apache 2.0 | AWS SDK for Python | -| 23 | hvac | >=1.1.1 | HashiCorp | MPL 2.0 | HashiCorp Vault client | -| 24 | azure-keyvault-secrets | >=4.7.0,<5.0.0 | Microsoft | MIT | Azure Key Vault secrets management | -| 25 | web3 | >=6.18.0,<7.0.0 | N/A | MIT | Ethereum blockchain library | -| 26 | polars[deltalake] | >=0.18.8,<1.0.0 | N/A | MIT | DataFrame library with Delta Lake support | -| 27 | delta-sharing | >=1.0.0,<1.1.0 | N/A | Apache 2.0 | Delta Sharing library | -| 28 | xarray | >=2023.1.0,<2023.8.0 | N/A | BSD 3-Clause | N-dimensional array library | -| 29 | ecmwf-api-client | >=1.6.3,<2.0.0 | N/A | Apache 2.0 | ECMWF API client | -| 30 | netCDF4 | >=1.6.4,<2.0.0 | N/A | BSD 3-Clause | NetCDF file reading/writing | -| 31 | joblib | >=1.3.2,<2.0.0 | N/A | BSD 3-Clause | Lightweight pipelining library | -| 32 | sqlparams | >=5.1.0,<6.0.0 | N/A | MIT | SQL query parameters library | -| 33 | entsoe-py | >=0.5.10,<1.0.0 | N/A | MIT | ENTSOE API client | -| 34 | pytest | ==7.4.0 | N/A | MIT | Testing framework | -| 35 | pytest-mock | ==3.11.1 | N/A | MIT | Mocking for pytest | -| 36 | pytest-cov | ==4.1.0 | N/A | MIT | Coverage reporting for pytest | -| 37 | pylint | ==2.17.4 | N/A | GPL 2.0 | Static code analysis for Python | -| 38 | pip | >=23.1.2 | N/A | MIT | Python package installer | -| 39 | turbodbc | ==4.11.0 | N/A | MIT | ODBC interface for Python | -| 40 | numpy | >=1.23.4,<2.0.0 | NumPy Developers | BSD 3-Clause | Numerical computing library | -| 41 | oauthlib | >=3.2.2,<4.0.0 | N/A | MIT | OAuth library | -| 42 | cryptography | >=38.0.3 | N/A | MIT | 
Cryptography library | -| 43 | fastapi | >=0.110.0,<1.0.0 | Sebastián Ramírez | MIT | Fast web framework | -| 44 | httpx | >=0.24.1,<1.0.0 | N/A | MIT | HTTP client for Python | -| 45 | openjdk | >=11.0.15,<12.0.0 | N/A | N/A | OpenJDK Java runtime | -| 46 | mkdocs-material | ==9.5.20 | N/A | MIT | Material theme for MkDocs | -| 47 | mkdocs-material-extensions | ==1.3.1 | N/A | MIT | Extensions for MkDocs | -| 48 | mkdocstrings | ==0.25.0 | N/A | MIT | Documentation generation | -| 49 | mkdocstrings-python | ==1.10.8 | N/A | MIT | Python support for mkdocstrings | -| 50 | mkdocs-macros-plugin | ==1.0.1 | N/A | MIT | Macros for MkDocs | -| 51 | mkdocs-autorefs | >=1.0.0,<1.1.0 | N/A | MIT | Automatic references for MkDocs | -| 52 | pygments | ==2.16.1 | N/A | BSD 2-Clause | Syntax highlighting library | -| 53 | pymdown-extensions | ==10.8.1 | N/A | MIT | Extensions for Markdown | -| 54 | pygithub | >=1.59.0 | N/A | MIT | GitHub API client | -| 55 | pyjwt | >=2.8.0,<3.0.0 | N/A | MIT | JSON Web | -| 56 | conda | >=24.9.2 | N/A | BSD 3-Clause | Package installer | - - -### Summary -- **Total Components**: 56 -- **Last Updated**: [ 29.10.2024] diff --git a/Deliverables/sprint-02/software-architecture.pdf b/Deliverables/sprint-02/software-architecture.pdf deleted file mode 100644 index 51a4a60d1..000000000 Binary files a/Deliverables/sprint-02/software-architecture.pdf and /dev/null differ diff --git a/Deliverables/sprint-03/feature-board.jpg b/Deliverables/sprint-03/feature-board.jpg deleted file mode 100644 index 401e0492c..000000000 Binary files a/Deliverables/sprint-03/feature-board.jpg and /dev/null differ diff --git a/Deliverables/sprint-03/feature-board.tsv b/Deliverables/sprint-03/feature-board.tsv deleted file mode 100644 index 754668937..000000000 --- a/Deliverables/sprint-03/feature-board.tsv +++ /dev/null @@ -1,24 +0,0 @@ -Title URL Assignees Status Estimated size Real size Labels -Create a test pipeline to run during release 
https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/24 FelipeTrost In Progress 5 -[Component] Anomaly detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/6 FelipeTrost Awaiting Review 3 enhancement -[Component] Identify missing data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/2 mollle Awaiting Review 8 enhancement -[Component] Normalization of Data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/18 kristen149, Timm638 In Progress 8 -Explore the test data and brainstorm RTDIP component ideas https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/11 chris-1187 In Progress 5 -[Component] Clean data based on Interval/Pattern https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/22 chris-1187, dh1542 In Progress -[sprint-02] Create software architecture diagram https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/10 dh1542, Timm638 Feature Archive 3 5 -[sprint-02] Create software bill of materials https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/9 kristen149 Feature Archive 1 1 -Fix broken virtual environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/8 dh1542, Timm638 Feature Archive 3 bug -[Component] Duplicate detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/4 chris-1187, dh1542 Feature Archive 8 8 enhancement -[Component] Outlier detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/3 FelipeTrost Feature Archive duplicate -Set up a development environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/1 Feature Archive good first issue -Time series prediction with linear regression 
https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/28 Product Backlog component -Time Series prediction using ARIMA https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/29 Product Backlog component -Store monitoring outputs in a standardized format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/26 Product Backlog enhancement -Validation of value ranges https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/31 Product Backlog component -Advanced Duplicate Detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/30 Product Backlog component -Dimensionality Reduction https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/17 Product Backlog component -Alternative Preprocessing Methods https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/19 Product Backlog component -Please adopt the Deliverables folder structure from https://github.com/amosproj/amos202Xss0Y-projname to your repo / branch https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/7 Feature Archive documentation -[Component] Trend Identification https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/20 Feature Archive -Define clear acceptance criteria for components https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/16 Feature Archive -[Component] Data Format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/21 Feature Archive \ No newline at end of file diff --git a/Deliverables/sprint-03/imp-squared-backlog.jpg b/Deliverables/sprint-03/imp-squared-backlog.jpg deleted file mode 100644 index fe18f9ab2..000000000 Binary files a/Deliverables/sprint-03/imp-squared-backlog.jpg and /dev/null differ diff --git a/Deliverables/sprint-03/imp-squared-backlog.tsv 
b/Deliverables/sprint-03/imp-squared-backlog.tsv deleted file mode 100644 index 3fc247f1b..000000000 --- a/Deliverables/sprint-03/imp-squared-backlog.tsv +++ /dev/null @@ -1,6 +0,0 @@ -Title Assignees Status -Slack workspace (Avi) In Progress -Get to know expectations and requirements from Industry Partner In Progress -Homework no assigned clearly - now assigned In Progress -SD Meeting In Progress -Figure out pipeline bug - for everyone Todo \ No newline at end of file diff --git a/Deliverables/sprint-03/planning-documents.pdf b/Deliverables/sprint-03/planning-documents.pdf deleted file mode 100644 index 7b62ce61d..000000000 Binary files a/Deliverables/sprint-03/planning-documents.pdf and /dev/null differ diff --git a/Deliverables/sprint-03/rtdip-sdk-sbom.md b/Deliverables/sprint-03/rtdip-sdk-sbom.md deleted file mode 100644 index b97351fc2..000000000 --- a/Deliverables/sprint-03/rtdip-sdk-sbom.md +++ /dev/null @@ -1,87 +0,0 @@ -# Software Bill of Materials (SBOM) - -## Project Name: rtdip-sdk -## Version: [RTDIP_SDK_1] -## Date: [05.11.2024] -## License: Apache License, Version 2.0 - -### Overview -This SBOM lists all required and optional dependencies for the `rtdip-sdk` project, including their versions and licenses. - -### Components - - -| Field | Name | Version Range | Supplier | License | Comment | -|-------|--------------------------------|----------------------|-----------------------------|--------------------|--------------------------| -| 1 | databricks-sql-connector | >=3.1.0,<4.0.0 | Databricks, Inc. 
| Apache 2.0 | SQL connector for Databricks | -| 2 | azure-identity | >=1.12.0,<2.0.0 | Microsoft | MIT | Identity management for Azure | -| 3 | pandas | >=1.5.2,<2.2.0 | The Pandas Development Team | BSD 3-Clause | Data manipulation library | -| 4 | jinja2 | >=3.1.4,<4.0.0 | Jinja2 Team | BSD 3-Clause | Template engine for Python | -| 5 | importlib_metadata | >=7.0.0 | PyPa | MIT | Metadata for Python packages | -| 6 | semver | >=3.0.0,<4.0.0 | Mikhail Korobeynikov | MIT | Semantic versioning library | -| 7 | xlrd | >=2.0.1,<3.0.0 | Python Software Foundation | MIT | Library for reading Excel files | -| 8 | grpcio | >=1.48.1 | Google LLC | Apache 2.0 | gRPC library for Python | -| 9 | grpcio-status | >=1.48.1 | Google LLC | Apache 2.0 | gRPC status library | -| 10 | googleapis-common-protos | >=1.56.4 | Google LLC | Apache 2.0 | Common protobufs for Google APIs | -| 11 | langchain | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Framework for LLMs | -| 12 | langchain-community | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Community contributions to LangChain | -| 13 | openai | >=1.13.3,<2.0.0 | OpenAI | MIT | OpenAI API client | -| 14 | pydantic | >=2.6.0,<3.0.0 | Samuel Colvin | MIT | Data validation library | -| 15 | pyspark | >=3.3.0,<3.6.0 | The Apache Software Foundation | Apache 2.0 | Spark library for Python | -| 16 | delta-spark | >=2.2.0,<3.3.0 | Databricks, Inc. | Apache 2.0 | Delta Lake integration with Spark | -| 17 | dependency-injector | >=4.41.0,<5.0.0 | Paul Ganssle | MIT | Dependency injection framework | -| 18 | databricks-sdk | >=0.20.0,<1.0.0 | Databricks, Inc. 
| Apache 2.0 | SDK for Databricks services | -| 19 | azure-storage-file-datalake | >=12.12.0,<13.0.0 | Microsoft | MIT | Azure Data Lake Storage client | -| 20 | azure-mgmt-storage | >=21.0.0 | Microsoft | MIT | Azure Storage management client | -| 21 | azure-mgmt-eventgrid | >=10.2.0 | Microsoft | MIT | Azure Event Grid management client | -| 22 | boto3 | >=1.28.2,<2.0.0 | Amazon Web Services | Apache 2.0 | AWS SDK for Python | -| 23 | hvac | >=1.1.1 | HashiCorp | MPL 2.0 | HashiCorp Vault client | -| 24 | azure-keyvault-secrets | >=4.7.0,<5.0.0 | Microsoft | MIT | Azure Key Vault secrets management | -| 25 | web3 | >=6.18.0,<7.0.0 | N/A | MIT | Ethereum blockchain library | -| 26 | polars[deltalake] | >=0.18.8,<1.0.0 | N/A | MIT | DataFrame library with Delta Lake support | -| 27 | delta-sharing | >=1.0.0,<1.1.0 | N/A | Apache 2.0 | Delta Sharing library | -| 28 | xarray | >=2023.1.0,<2023.8.0 | N/A | BSD 3-Clause | N-dimensional array library | -| 29 | ecmwf-api-client | >=1.6.3,<2.0.0 | N/A | Apache 2.0 | ECMWF API client | -| 30 | netCDF4 | >=1.6.4,<2.0.0 | N/A | BSD 3-Clause | NetCDF file reading/writing | -| 31 | joblib | >=1.3.2,<2.0.0 | N/A | BSD 3-Clause | Lightweight pipelining library | -| 32 | sqlparams | >=5.1.0,<6.0.0 | N/A | MIT | SQL query parameters library | -| 33 | entsoe-py | >=0.5.10,<1.0.0 | N/A | MIT | ENTSOE API client | -| 34 | pytest | ==7.4.0 | N/A | MIT | Testing framework | -| 35 | pytest-mock | ==3.11.1 | N/A | MIT | Mocking for pytest | -| 36 | pytest-cov | ==4.1.0 | N/A | MIT | Coverage reporting for pytest | -| 37 | pylint | ==2.17.4 | N/A | GPL 2.0 | Static code analysis for Python | -| 38 | pip | >=23.1.2 | N/A | MIT | Python package installer | -| 39 | turbodbc | ==4.11.0 | N/A | MIT | ODBC interface for Python | -| 40 | numpy | >=1.23.4,<2.0.0 | NumPy Developers | BSD 3-Clause | Numerical computing library | -| 41 | oauthlib | >=3.2.2,<4.0.0 | N/A | MIT | OAuth library | -| 42 | cryptography | >=38.0.3 | N/A | MIT | 
Cryptography library | -| 43 | fastapi | >=0.110.0,<1.0.0 | Sebastián Ramírez | MIT | Fast web framework | -| 44 | httpx | >=0.24.1,<1.0.0 | N/A | MIT | HTTP client for Python | -| 45 | openjdk | >=11.0.15,<12.0.0 | N/A | N/A | OpenJDK Java runtime | -| 46 | mkdocs-material | ==9.5.20 | N/A | MIT | Material theme for MkDocs | -| 47 | mkdocs-material-extensions | ==1.3.1 | N/A | MIT | Extensions for MkDocs | -| 48 | mkdocstrings | ==0.25.0 | N/A | MIT | Documentation generation | -| 49 | mkdocstrings-python | ==1.10.8 | N/A | MIT | Python support for mkdocstrings | -| 50 | mkdocs-macros-plugin | ==1.0.1 | N/A | MIT | Macros for MkDocs | -| 51 | mkdocs-autorefs | >=1.0.0,<1.1.0 | N/A | MIT | Automatic references for MkDocs | -| 52 | pygments | ==2.16.1 | N/A | BSD 2-Clause | Syntax highlighting library | -| 53 | pymdown-extensions | ==10.8.1 | N/A | MIT | Extensions for Markdown | -| 54 | pygithub | >=1.59.0 | N/A | MIT | GitHub API client | -| 55 | pyjwt | >=2.8.0,<3.0.0 | N/A | MIT | JSON Web | -| 56 | conda | >=24.9.2 | N/A | BSD 3-Clause | Package installer | -| 57 | python | >=3.9,<3.12 | Python Software Foundation | PSF | Python programming language | -| 58 | pyodbc | >=4.0.39,<5.0.0 | N/A | MIT | ODBC library for Python | -| 59 | twine | ==4.0.2 | PyPA | Apache 2.0 | Python package publishing tool | -| 60 | black | >=24.1.0 | Python Software Foundation | MIT | Code formatter for Python | -| 61 | great-expectations | >=0.18.8,<1.0.0 | N/A | Apache 2.0 | Data validation tool | -| 62 | azure-functions | >=1.15.0,<2.0.0 | Microsoft | MIT | Functions for Azure services | -| 63 | build | ==0.10.0 | PyPA | MIT | Python package build tool | -| 64 | deltalake | >=0.10.1,<1.0.0 | Delta, Inc. 
| Apache 2.0 | Delta Lake interaction for Python | -| 65 | trio | >=0.22.1 | Python Software Foundation | MIT | Async library for concurrency | -| 66 | eth-typing | >=4.2.3,<5.0.0 | Ethereum Foundation | MIT | Ethereum types library | -| 67 | moto[s3] | >=5.0.16,<6.0.0 | Spulec | Apache 2.0 | Mock library for AWS S3 | -| 68 | pyarrow | >=14.0.1,<17.0.0 | Apache Arrow | Apache 2.0 | Columnar data storage and processing | - -### Summary -- **Total Components**: 68 -- **Last Updated**: [05.11.2024] diff --git a/Deliverables/sprint-04/Screenshot 2024-11-13 130226.png b/Deliverables/sprint-04/Screenshot 2024-11-13 130226.png deleted file mode 100644 index e6bb876fe..000000000 Binary files a/Deliverables/sprint-04/Screenshot 2024-11-13 130226.png and /dev/null differ diff --git a/Deliverables/sprint-04/Screenshot 2024-11-13 130453.png b/Deliverables/sprint-04/Screenshot 2024-11-13 130453.png deleted file mode 100644 index 4cee2b492..000000000 Binary files a/Deliverables/sprint-04/Screenshot 2024-11-13 130453.png and /dev/null differ diff --git a/Deliverables/sprint-04/amos2024ws01-feature-board - Board.tsv b/Deliverables/sprint-04/amos2024ws01-feature-board - Board.tsv deleted file mode 100644 index ef525c90c..000000000 --- a/Deliverables/sprint-04/amos2024ws01-feature-board - Board.tsv +++ /dev/null @@ -1,32 +0,0 @@ -Title URL Assignees Status Estimated size Real size Labels -Time Series prediction using ARIMA https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/29 Timm638 Feature Archive 13 8 component -Normalization of Data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/18 kristen149, Timm638 Feature Archive 8 8 component -Clean data based on Interval/Pattern https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/22 dh1542 Feature Archive 8 8 component -Create a test pipeline to run during release 
https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/24 FelipeTrost Feature Archive 5 1 -[Component] Identify missing data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/2 mollle Feature Archive 8 8 enhancement -Explore the test data and brainstorm RTDIP component ideas https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/11 chris-1187 Feature Archive 5 5 -[Component] Anomaly detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/6 FelipeTrost Feature Archive 3 8 enhancement -[sprint-02] Create software architecture diagram https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/10 dh1542, Timm638 Feature Archive 3 5 -[sprint-02] Create software bill of materials https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/9 kristen149 Feature Archive 1 1 -Fix broken virtual environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/8 dh1542, Timm638 Feature Archive 3 3 bug -[Component] Duplicate detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/4 chris-1187, dh1542 Feature Archive 8 8 enhancement -[Component] Outlier detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/3 FelipeTrost Feature Archive duplicate -Set up a development environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/1 Feature Archive good first issue -Time series prediction with linear regression https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/28 FelipeTrost, kristen149 Product Backlog 8 component -Missing value imputation https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/36 chris-1187 Product Backlog 13 component -Store monitoring outputs in a standardized format 
https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/26 dh1542 Product Backlog 13 enhancement -Validation of value ranges https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/31 mollle Product Backlog 3 component -Finish integrating ARIMA functionality of statsmodels into RTDIP https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/40 Product Backlog component -Reduce number of parameters needed to use ArimaPrediction effectively https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/41 Product Backlog component -Prepare RTDIP demo https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/43 Product Backlog -Advanced Duplicate Detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/30 Product Backlog component -Flatline detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/44 Product Backlog component -One-Hot Encoding https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/45 Product Backlog component -Dimensionality Reduction https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/17 Product Backlog component -Data Binning https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/46 Product Backlog component -Interval Screening and Missing Entry Insertion https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/47 Product Backlog component -Alternative Preprocessing Methods https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/19 Feature Archive component -Please adopt the Deliverables folder structure from https://github.com/amosproj/amos202Xss0Y-projname to your repo / branch https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/7 Feature Archive documentation -[Component] Trend 
Identification https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/20 Feature Archive -Define clear acceptance criteria for components https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/16 Feature Archive -[Component] Data Format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/21 Feature Archive \ No newline at end of file diff --git a/Deliverables/sprint-04/imp-squared-backlog.tsv b/Deliverables/sprint-04/imp-squared-backlog.tsv deleted file mode 100644 index 6b8507ae8..000000000 --- a/Deliverables/sprint-04/imp-squared-backlog.tsv +++ /dev/null @@ -1,10 +0,0 @@ -Title Assignees Status -Slack workspace (Avi) Done -SD Meeting Done -Homework no assigned clearly - now assigned Done -Figure out pipeline bug - for everyone Done -Get to know expectations and requirements from Industry Partner In Progress -Coordinated PR Reviews Todo -Make sure everyone can run the product (for example via readme doc) Todo -No expericence in ML Todo -Long term planning such as: (notes in description) Todo \ No newline at end of file diff --git a/Deliverables/sprint-04/planning-documents.pdf b/Deliverables/sprint-04/planning-documents.pdf deleted file mode 100644 index 2d3a43501..000000000 Binary files a/Deliverables/sprint-04/planning-documents.pdf and /dev/null differ diff --git a/Deliverables/sprint-04/rtdip-sdk-sbom.md b/Deliverables/sprint-04/rtdip-sdk-sbom.md deleted file mode 100644 index b97351fc2..000000000 --- a/Deliverables/sprint-04/rtdip-sdk-sbom.md +++ /dev/null @@ -1,87 +0,0 @@ -# Software Bill of Materials (SBOM) - -## Project Name: rtdip-sdk -## Version: [RTDIP_SDK_1] -## Date: [05.11.2024] -## License: Apache License, Version 2.0 - -### Overview -This SBOM lists all required and optional dependencies for the `rtdip-sdk` project, including their versions and licenses. 
- -### Components - - -| Field | Name | Version Range | Supplier | License | Comment | -|-------|--------------------------------|----------------------|-----------------------------|--------------------|--------------------------| -| 1 | databricks-sql-connector | >=3.1.0,<4.0.0 | Databricks, Inc. | Apache 2.0 | SQL connector for Databricks | -| 2 | azure-identity | >=1.12.0,<2.0.0 | Microsoft | MIT | Identity management for Azure | -| 3 | pandas | >=1.5.2,<2.2.0 | The Pandas Development Team | BSD 3-Clause | Data manipulation library | -| 4 | jinja2 | >=3.1.4,<4.0.0 | Jinja2 Team | BSD 3-Clause | Template engine for Python | -| 5 | importlib_metadata | >=7.0.0 | PyPa | MIT | Metadata for Python packages | -| 6 | semver | >=3.0.0,<4.0.0 | Mikhail Korobeynikov | MIT | Semantic versioning library | -| 7 | xlrd | >=2.0.1,<3.0.0 | Python Software Foundation | MIT | Library for reading Excel files | -| 8 | grpcio | >=1.48.1 | Google LLC | Apache 2.0 | gRPC library for Python | -| 9 | grpcio-status | >=1.48.1 | Google LLC | Apache 2.0 | gRPC status library | -| 10 | googleapis-common-protos | >=1.56.4 | Google LLC | Apache 2.0 | Common protobufs for Google APIs | -| 11 | langchain | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Framework for LLMs | -| 12 | langchain-community | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Community contributions to LangChain | -| 13 | openai | >=1.13.3,<2.0.0 | OpenAI | MIT | OpenAI API client | -| 14 | pydantic | >=2.6.0,<3.0.0 | Samuel Colvin | MIT | Data validation library | -| 15 | pyspark | >=3.3.0,<3.6.0 | The Apache Software Foundation | Apache 2.0 | Spark library for Python | -| 16 | delta-spark | >=2.2.0,<3.3.0 | Databricks, Inc. | Apache 2.0 | Delta Lake integration with Spark | -| 17 | dependency-injector | >=4.41.0,<5.0.0 | Paul Ganssle | MIT | Dependency injection framework | -| 18 | databricks-sdk | >=0.20.0,<1.0.0 | Databricks, Inc. 
| Apache 2.0 | SDK for Databricks services | -| 19 | azure-storage-file-datalake | >=12.12.0,<13.0.0 | Microsoft | MIT | Azure Data Lake Storage client | -| 20 | azure-mgmt-storage | >=21.0.0 | Microsoft | MIT | Azure Storage management client | -| 21 | azure-mgmt-eventgrid | >=10.2.0 | Microsoft | MIT | Azure Event Grid management client | -| 22 | boto3 | >=1.28.2,<2.0.0 | Amazon Web Services | Apache 2.0 | AWS SDK for Python | -| 23 | hvac | >=1.1.1 | HashiCorp | MPL 2.0 | HashiCorp Vault client | -| 24 | azure-keyvault-secrets | >=4.7.0,<5.0.0 | Microsoft | MIT | Azure Key Vault secrets management | -| 25 | web3 | >=6.18.0,<7.0.0 | N/A | MIT | Ethereum blockchain library | -| 26 | polars[deltalake] | >=0.18.8,<1.0.0 | N/A | MIT | DataFrame library with Delta Lake support | -| 27 | delta-sharing | >=1.0.0,<1.1.0 | N/A | Apache 2.0 | Delta Sharing library | -| 28 | xarray | >=2023.1.0,<2023.8.0 | N/A | BSD 3-Clause | N-dimensional array library | -| 29 | ecmwf-api-client | >=1.6.3,<2.0.0 | N/A | Apache 2.0 | ECMWF API client | -| 30 | netCDF4 | >=1.6.4,<2.0.0 | N/A | BSD 3-Clause | NetCDF file reading/writing | -| 31 | joblib | >=1.3.2,<2.0.0 | N/A | BSD 3-Clause | Lightweight pipelining library | -| 32 | sqlparams | >=5.1.0,<6.0.0 | N/A | MIT | SQL query parameters library | -| 33 | entsoe-py | >=0.5.10,<1.0.0 | N/A | MIT | ENTSOE API client | -| 34 | pytest | ==7.4.0 | N/A | MIT | Testing framework | -| 35 | pytest-mock | ==3.11.1 | N/A | MIT | Mocking for pytest | -| 36 | pytest-cov | ==4.1.0 | N/A | MIT | Coverage reporting for pytest | -| 37 | pylint | ==2.17.4 | N/A | GPL 2.0 | Static code analysis for Python | -| 38 | pip | >=23.1.2 | N/A | MIT | Python package installer | -| 39 | turbodbc | ==4.11.0 | N/A | MIT | ODBC interface for Python | -| 40 | numpy | >=1.23.4,<2.0.0 | NumPy Developers | BSD 3-Clause | Numerical computing library | -| 41 | oauthlib | >=3.2.2,<4.0.0 | N/A | MIT | OAuth library | -| 42 | cryptography | >=38.0.3 | N/A | MIT | 
Cryptography library | -| 43 | fastapi | >=0.110.0,<1.0.0 | Sebastián Ramírez | MIT | Fast web framework | -| 44 | httpx | >=0.24.1,<1.0.0 | N/A | MIT | HTTP client for Python | -| 45 | openjdk | >=11.0.15,<12.0.0 | N/A | N/A | OpenJDK Java runtime | -| 46 | mkdocs-material | ==9.5.20 | N/A | MIT | Material theme for MkDocs | -| 47 | mkdocs-material-extensions | ==1.3.1 | N/A | MIT | Extensions for MkDocs | -| 48 | mkdocstrings | ==0.25.0 | N/A | MIT | Documentation generation | -| 49 | mkdocstrings-python | ==1.10.8 | N/A | MIT | Python support for mkdocstrings | -| 50 | mkdocs-macros-plugin | ==1.0.1 | N/A | MIT | Macros for MkDocs | -| 51 | mkdocs-autorefs | >=1.0.0,<1.1.0 | N/A | MIT | Automatic references for MkDocs | -| 52 | pygments | ==2.16.1 | N/A | BSD 2-Clause | Syntax highlighting library | -| 53 | pymdown-extensions | ==10.8.1 | N/A | MIT | Extensions for Markdown | -| 54 | pygithub | >=1.59.0 | N/A | MIT | GitHub API client | -| 55 | pyjwt | >=2.8.0,<3.0.0 | N/A | MIT | JSON Web | -| 56 | conda | >=24.9.2 | N/A | BSD 3-Clause | Package installer | -| 57 | python | >=3.9,<3.12 | Python Software Foundation | PSF | Python programming language | -| 58 | pyodbc | >=4.0.39,<5.0.0 | N/A | MIT | ODBC library for Python | -| 59 | twine | ==4.0.2 | PyPA | Apache 2.0 | Python package publishing tool | -| 60 | black | >=24.1.0 | Python Software Foundation | MIT | Code formatter for Python | -| 61 | great-expectations | >=0.18.8,<1.0.0 | N/A | Apache 2.0 | Data validation tool | -| 62 | azure-functions | >=1.15.0,<2.0.0 | Microsoft | MIT | Functions for Azure services | -| 63 | build | ==0.10.0 | PyPA | MIT | Python package build tool | -| 64 | deltalake | >=0.10.1,<1.0.0 | Delta, Inc. 
| Apache 2.0 | Delta Lake interaction for Python | -| 65 | trio | >=0.22.1 | Python Software Foundation | MIT | Async library for concurrency | -| 66 | eth-typing | >=4.2.3,<5.0.0 | Ethereum Foundation | MIT | Ethereum types library | -| 67 | moto[s3] | >=5.0.16,<6.0.0 | Spulec | Apache 2.0 | Mock library for AWS S3 | -| 68 | pyarrow | >=14.0.1,<17.0.0 | Apache Arrow | Apache 2.0 | Columnar data storage and processing | - -### Summary -- **Total Components**: 68 -- **Last Updated**: [05.11.2024] diff --git a/Deliverables/sprint-05/build-process-video.mkv b/Deliverables/sprint-05/build-process-video.mkv deleted file mode 100644 index 307169130..000000000 Binary files a/Deliverables/sprint-05/build-process-video.mkv and /dev/null differ diff --git a/Deliverables/sprint-05/feature-board.jpg b/Deliverables/sprint-05/feature-board.jpg deleted file mode 100644 index 7b97c2c6f..000000000 Binary files a/Deliverables/sprint-05/feature-board.jpg and /dev/null differ diff --git a/Deliverables/sprint-05/feature-board.tsv b/Deliverables/sprint-05/feature-board.tsv deleted file mode 100644 index e4b9d785d..000000000 --- a/Deliverables/sprint-05/feature-board.tsv +++ /dev/null @@ -1,32 +0,0 @@ -Title URL Assignees Status Estimated size Real size Labels -Time series prediction with linear regression https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/28 FelipeTrost, kristen149 Awaiting Review 8 component -Missing value imputation https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/36 chris-1187 Awaiting Review 13 component -Validation of value ranges https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/31 mollle Awaiting Review 3 component -Time Series prediction using ARIMA https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/29 Timm638 Awaiting Review 13 8 component -Flatline detection 
https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/44 mollle Awaiting Review component -Advanced Duplicate Detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/30 mollle In Progress component -One-Hot Encoding https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/45 kristen149 In Progress component -Prepare RTDIP demo https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/43 Timm638 In Progress 8 -Store monitoring outputs in a standardized format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/26 dh1542 In Progress 13 enhancement -Finish integrating ARIMA functionality of statsmodels into RTDIP https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/40 Sprint Backlog 5 component -Reduce number of parameters needed to use ArimaPrediction effectively https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/41 chris-1187 Sprint Backlog 8 component -Normalization of Data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/18 kristen149, Timm638 Feature Archive 8 8 component -Clean data based on Interval/Pattern https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/22 dh1542 Feature Archive 8 8 component -Create a test pipeline to run during release https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/24 FelipeTrost Feature Archive 5 1 -[Component] Identify missing data https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/2 mollle Feature Archive 8 8 enhancement -Explore the test data and brainstorm RTDIP component ideas https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/11 chris-1187 Feature Archive 5 5 -[Component] Anomaly detection 
https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/6 FelipeTrost Feature Archive 3 8 enhancement -[sprint-02] Create software architecture diagram https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/10 dh1542, Timm638 Feature Archive 3 5 -[sprint-02] Create software bill of materials https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/9 kristen149 Feature Archive 1 1 -Fix broken virtual environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/8 dh1542, Timm638 Feature Archive 3 3 bug -[Component] Duplicate detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/4 chris-1187, dh1542 Feature Archive 8 8 enhancement -[Component] Outlier detection https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/3 FelipeTrost Feature Archive duplicate -Set up a development environment https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/1 Feature Archive good first issue -Dimensionality Reduction https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/17 Product Backlog component -Data Binning https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/46 Product Backlog component -Interval Screening and Missing Entry Insertion https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/47 Product Backlog component -Alternative Preprocessing Methods https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/19 Feature Archive component -Please adopt the Deliverables folder structure from https://github.com/amosproj/amos202Xss0Y-projname to your repo / branch https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/7 Feature Archive documentation -[Component] Trend Identification 
https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/20 Feature Archive -Define clear acceptance criteria for components https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/16 Feature Archive -[Component] Data Format https://github.com/amosproj/amos2024ws01-rtdip-data-quality-checker/issues/21 Feature Archive \ No newline at end of file diff --git a/Deliverables/sprint-05/imp-squared-backlog.jpg b/Deliverables/sprint-05/imp-squared-backlog.jpg deleted file mode 100644 index 2b4a02aa5..000000000 Binary files a/Deliverables/sprint-05/imp-squared-backlog.jpg and /dev/null differ diff --git a/Deliverables/sprint-05/imp-squared-backlog.tsv b/Deliverables/sprint-05/imp-squared-backlog.tsv deleted file mode 100644 index 4e21cef7a..000000000 --- a/Deliverables/sprint-05/imp-squared-backlog.tsv +++ /dev/null @@ -1,12 +0,0 @@ -Title Assignees Status -Coordinated PR Reviews In Progress -Make sure everyone can run the product (for example via readme doc) In Progress -No expericence in ML In Progress -Discuss with industry partner an optimal time for the meeting to take place In Progress -Make sure the team meeting ends at 14:00 In Progress -Slack workspace (Avi) Done -SD Meeting Done -Homework no assigned clearly - now assigned Done -Figure out pipeline bug - for everyone Done -Get to know expectations and requirements from Industry Partner Done -Long term planning such as: (notes in description) Todo \ No newline at end of file diff --git a/Deliverables/sprint-05/planning-documents.pdf b/Deliverables/sprint-05/planning-documents.pdf deleted file mode 100644 index 3fc736057..000000000 Binary files a/Deliverables/sprint-05/planning-documents.pdf and /dev/null differ diff --git a/Deliverables/sprint-05/rtdip-sdk-sbom.md b/Deliverables/sprint-05/rtdip-sdk-sbom.md deleted file mode 100644 index b97351fc2..000000000 --- a/Deliverables/sprint-05/rtdip-sdk-sbom.md +++ /dev/null @@ -1,87 +0,0 @@ -# Software Bill of 
Materials (SBOM) - -## Project Name: rtdip-sdk -## Version: [RTDIP_SDK_1] -## Date: [05.11.2024] -## License: Apache License, Version 2.0 - -### Overview -This SBOM lists all required and optional dependencies for the `rtdip-sdk` project, including their versions and licenses. - -### Components - - -| Field | Name | Version Range | Supplier | License | Comment | -|-------|--------------------------------|----------------------|-----------------------------|--------------------|--------------------------| -| 1 | databricks-sql-connector | >=3.1.0,<4.0.0 | Databricks, Inc. | Apache 2.0 | SQL connector for Databricks | -| 2 | azure-identity | >=1.12.0,<2.0.0 | Microsoft | MIT | Identity management for Azure | -| 3 | pandas | >=1.5.2,<2.2.0 | The Pandas Development Team | BSD 3-Clause | Data manipulation library | -| 4 | jinja2 | >=3.1.4,<4.0.0 | Jinja2 Team | BSD 3-Clause | Template engine for Python | -| 5 | importlib_metadata | >=7.0.0 | PyPa | MIT | Metadata for Python packages | -| 6 | semver | >=3.0.0,<4.0.0 | Mikhail Korobeynikov | MIT | Semantic versioning library | -| 7 | xlrd | >=2.0.1,<3.0.0 | Python Software Foundation | MIT | Library for reading Excel files | -| 8 | grpcio | >=1.48.1 | Google LLC | Apache 2.0 | gRPC library for Python | -| 9 | grpcio-status | >=1.48.1 | Google LLC | Apache 2.0 | gRPC status library | -| 10 | googleapis-common-protos | >=1.56.4 | Google LLC | Apache 2.0 | Common protobufs for Google APIs | -| 11 | langchain | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Framework for LLMs | -| 12 | langchain-community | >=0.2.0,<0.3.0 | Harrison Chase | MIT | Community contributions to LangChain | -| 13 | openai | >=1.13.3,<2.0.0 | OpenAI | MIT | OpenAI API client | -| 14 | pydantic | >=2.6.0,<3.0.0 | Samuel Colvin | MIT | Data validation library | -| 15 | pyspark | >=3.3.0,<3.6.0 | The Apache Software Foundation | Apache 2.0 | Spark library for Python | -| 16 | delta-spark | >=2.2.0,<3.3.0 | Databricks, Inc. 
| Apache 2.0 | Delta Lake integration with Spark | -| 17 | dependency-injector | >=4.41.0,<5.0.0 | Paul Ganssle | MIT | Dependency injection framework | -| 18 | databricks-sdk | >=0.20.0,<1.0.0 | Databricks, Inc. | Apache 2.0 | SDK for Databricks services | -| 19 | azure-storage-file-datalake | >=12.12.0,<13.0.0 | Microsoft | MIT | Azure Data Lake Storage client | -| 20 | azure-mgmt-storage | >=21.0.0 | Microsoft | MIT | Azure Storage management client | -| 21 | azure-mgmt-eventgrid | >=10.2.0 | Microsoft | MIT | Azure Event Grid management client | -| 22 | boto3 | >=1.28.2,<2.0.0 | Amazon Web Services | Apache 2.0 | AWS SDK for Python | -| 23 | hvac | >=1.1.1 | HashiCorp | MPL 2.0 | HashiCorp Vault client | -| 24 | azure-keyvault-secrets | >=4.7.0,<5.0.0 | Microsoft | MIT | Azure Key Vault secrets management | -| 25 | web3 | >=6.18.0,<7.0.0 | N/A | MIT | Ethereum blockchain library | -| 26 | polars[deltalake] | >=0.18.8,<1.0.0 | N/A | MIT | DataFrame library with Delta Lake support | -| 27 | delta-sharing | >=1.0.0,<1.1.0 | N/A | Apache 2.0 | Delta Sharing library | -| 28 | xarray | >=2023.1.0,<2023.8.0 | N/A | BSD 3-Clause | N-dimensional array library | -| 29 | ecmwf-api-client | >=1.6.3,<2.0.0 | N/A | Apache 2.0 | ECMWF API client | -| 30 | netCDF4 | >=1.6.4,<2.0.0 | N/A | BSD 3-Clause | NetCDF file reading/writing | -| 31 | joblib | >=1.3.2,<2.0.0 | N/A | BSD 3-Clause | Lightweight pipelining library | -| 32 | sqlparams | >=5.1.0,<6.0.0 | N/A | MIT | SQL query parameters library | -| 33 | entsoe-py | >=0.5.10,<1.0.0 | N/A | MIT | ENTSOE API client | -| 34 | pytest | ==7.4.0 | N/A | MIT | Testing framework | -| 35 | pytest-mock | ==3.11.1 | N/A | MIT | Mocking for pytest | -| 36 | pytest-cov | ==4.1.0 | N/A | MIT | Coverage reporting for pytest | -| 37 | pylint | ==2.17.4 | N/A | GPL 2.0 | Static code analysis for Python | -| 38 | pip | >=23.1.2 | N/A | MIT | Python package installer | -| 39 | turbodbc | ==4.11.0 | N/A | MIT | ODBC interface for Python | -| 40 
| numpy | >=1.23.4,<2.0.0 | NumPy Developers | BSD 3-Clause | Numerical computing library | -| 41 | oauthlib | >=3.2.2,<4.0.0 | N/A | MIT | OAuth library | -| 42 | cryptography | >=38.0.3 | N/A | MIT | Cryptography library | -| 43 | fastapi | >=0.110.0,<1.0.0 | Sebastián Ramírez | MIT | Fast web framework | -| 44 | httpx | >=0.24.1,<1.0.0 | N/A | MIT | HTTP client for Python | -| 45 | openjdk | >=11.0.15,<12.0.0 | N/A | N/A | OpenJDK Java runtime | -| 46 | mkdocs-material | ==9.5.20 | N/A | MIT | Material theme for MkDocs | -| 47 | mkdocs-material-extensions | ==1.3.1 | N/A | MIT | Extensions for MkDocs | -| 48 | mkdocstrings | ==0.25.0 | N/A | MIT | Documentation generation | -| 49 | mkdocstrings-python | ==1.10.8 | N/A | MIT | Python support for mkdocstrings | -| 50 | mkdocs-macros-plugin | ==1.0.1 | N/A | MIT | Macros for MkDocs | -| 51 | mkdocs-autorefs | >=1.0.0,<1.1.0 | N/A | MIT | Automatic references for MkDocs | -| 52 | pygments | ==2.16.1 | N/A | BSD 2-Clause | Syntax highlighting library | -| 53 | pymdown-extensions | ==10.8.1 | N/A | MIT | Extensions for Markdown | -| 54 | pygithub | >=1.59.0 | N/A | MIT | GitHub API client | -| 55 | pyjwt | >=2.8.0,<3.0.0 | N/A | MIT | JSON Web | -| 56 | conda | >=24.9.2 | N/A | BSD 3-Clause | Package installer | -| 57 | python | >=3.9,<3.12 | Python Software Foundation | PSF | Python programming language | -| 58 | pyodbc | >=4.0.39,<5.0.0 | N/A | MIT | ODBC library for Python | -| 59 | twine | ==4.0.2 | PyPA | Apache 2.0 | Python package publishing tool | -| 60 | black | >=24.1.0 | Python Software Foundation | MIT | Code formatter for Python | -| 61 | great-expectations | >=0.18.8,<1.0.0 | N/A | Apache 2.0 | Data validation tool | -| 62 | azure-functions | >=1.15.0,<2.0.0 | Microsoft | MIT | Functions for Azure services | -| 63 | build | ==0.10.0 | PyPA | MIT | Python package build tool | -| 64 | deltalake | >=0.10.1,<1.0.0 | Delta, Inc. 
| Apache 2.0 | Delta Lake interaction for Python | -| 65 | trio | >=0.22.1 | Python Software Foundation | MIT | Async library for concurrency | -| 66 | eth-typing | >=4.2.3,<5.0.0 | Ethereum Foundation | MIT | Ethereum types library | -| 67 | moto[s3] | >=5.0.16,<6.0.0 | Spulec | Apache 2.0 | Mock library for AWS S3 | -| 68 | pyarrow | >=14.0.1,<17.0.0 | Apache Arrow | Apache 2.0 | Columnar data storage and processing | - -### Summary -- **Total Components**: 68 -- **Last Updated**: [05.11.2024] diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/interfaces.py similarity index 81% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/interfaces.py index 3809e343e..2ad1a2811 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/interfaces.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/interfaces.py @@ -13,11 +13,13 @@ # limitations under the License. 
from abc import abstractmethod -from ..interfaces import PipelineComponentBaseInterface + from pyspark.sql import DataFrame +from src.sdk.python.rtdip_sdk.pipelines.interfaces import PipelineComponentBaseInterface + -class WranglerBaseInterface(PipelineComponentBaseInterface): +class DataManipulationBaseInterface(PipelineComponentBaseInterface): @abstractmethod def filter(self) -> DataFrame: pass diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py similarity index 95% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py index 5305a429e..74ed9fc2f 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2024 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/__init__.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/__init__.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/duplicate_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/duplicate_detection.py similarity index 87% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/duplicate_detection.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/duplicate_detection.py index 1fa777e02..5cf6c743e 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/duplicate_detection.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/duplicate_detection.py @@ -14,18 +14,21 @@ from pyspark.sql.functions import desc from pyspark.sql import DataFrame as PySparkDataFrame -from ...interfaces import WranglerBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from ...interfaces import DataManipulationBaseInterface +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) -class DuplicateDetection(WranglerBaseInterface): +class DuplicateDetection(DataManipulationBaseInterface): """ Cleanses a PySpark DataFrame from duplicates. 
Example -------- ```python - from rtdip_sdk.pipelines.monitoring.spark.data_quality.duplicate_detection import DuplicateDetection + from rtdip_sdk.pipelines.monitoring.spark.data_manipulation.duplicate_detection import DuplicateDetection from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/interval_filtering.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/interval_filtering.py similarity index 96% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/interval_filtering.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/interval_filtering.py index 779e8a419..71bb00ec7 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/interval_filtering.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/interval_filtering.py @@ -19,12 +19,14 @@ from pyspark.sql import SparkSession from pyspark.sql import DataFrame +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) +from ...interfaces import DataManipulationBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType -from ...interfaces import WranglerBaseInterface - -class IntervalFiltering(WranglerBaseInterface): +class IntervalFiltering(DataManipulationBaseInterface): """ Cleanses a DataFrame by removing rows outside a specified interval window. Supported time stamp columns are DateType and StringType. 
diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/k_sigma_anomaly_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/k_sigma_anomaly_detection.py similarity index 93% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/k_sigma_anomaly_detection.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/k_sigma_anomaly_detection.py index 7e5b9ecac..417898cc4 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/k_sigma_anomaly_detection.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/k_sigma_anomaly_detection.py @@ -14,11 +14,14 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.functions import mean, stddev, abs, col -from ...interfaces import WranglerBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from ...interfaces import DataManipulationBaseInterface +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) -class KSigmaAnomalyDetection(WranglerBaseInterface): +class KSigmaAnomalyDetection(DataManipulationBaseInterface): """ Anomaly detection with the k-sigma method. This method either computes the mean and standard deviation, or the median and the median absolute deviation (MAD) of the data. The k-sigma method then filters out all data points that are k times the standard deviation away from the mean, or k times the MAD away from the median. @@ -27,7 +30,7 @@ class KSigmaAnomalyDetection(WranglerBaseInterface): Example -------- ```python - from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.k_sigma_anomaly_detection import KSigmaAnomalyDetection + from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_manipulation.k_sigma_anomaly_detection import KSigmaAnomalyDetection spark = ... # SparkSession df = ... 
# Get a PySpark DataFrame diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/missing_value_imputation.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/missing_value_imputation.py similarity index 97% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/missing_value_imputation.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/missing_value_imputation.py index d784787cc..081068b41 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/missing_value_imputation.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/missing_value_imputation.py @@ -20,11 +20,14 @@ import numpy as np from datetime import timedelta from typing import List -from ...interfaces import WranglerBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from ...interfaces import DataManipulationBaseInterface +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) -class MissingValueImputation(WranglerBaseInterface): +class MissingValueImputation(DataManipulationBaseInterface): """ Imputes missing values in a univariate time series creating a continuous curve of data points. 
For that, the time intervals of each individual source is calculated, to then insert empty records at the missing timestamps with @@ -36,7 +39,7 @@ class MissingValueImputation(WranglerBaseInterface): from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame from pyspark.sql.types import StructType, StructField, StringType - from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.missing_value_imputation import ( + from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_manipulation.missing_value_imputation import ( MissingValueImputation, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/__init__.py similarity index 95% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/__init__.py index 5305a429e..74ed9fc2f 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2024 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/denormalization.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/denormalization.py similarity index 88% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/denormalization.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/denormalization.py index 06b0a509b..e63bb6b03 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/denormalization.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/denormalization.py @@ -12,14 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. from pyspark.sql import DataFrame as PySparkDataFrame -from .....data_wranglers.interfaces import WranglerBaseInterface -from ....._pipeline_utils.models import Libraries, SystemType +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.interfaces import ( + DataManipulationBaseInterface, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) from .normalization import ( NormalizationBaseClass, ) -class Denormalization(WranglerBaseInterface): +class Denormalization(DataManipulationBaseInterface): """ #TODO Applies the appropriate denormalization method to revert values to their original scale. 
diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization.py similarity index 93% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization.py index cb61d444f..fbe94b0ba 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization.py @@ -14,11 +14,16 @@ from abc import abstractmethod from pyspark.sql import DataFrame as PySparkDataFrame from typing import List -from .....data_wranglers.interfaces import WranglerBaseInterface -from ....._pipeline_utils.models import Libraries, SystemType +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.interfaces import ( + DataManipulationBaseInterface, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) -class NormalizationBaseClass(WranglerBaseInterface): +class NormalizationBaseClass(DataManipulationBaseInterface): """ A base class for applying normalization techniques to multiple columns in a PySpark DataFrame. 
This class serves as a framework to support various normalization methods (e.g., Z-Score, Min-Max, and Mean), diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_mean.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_mean.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_mean.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_mean.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_minmax.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_minmax.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_minmax.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_minmax.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_zscore.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_zscore.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/normalization/normalization_zscore.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/normalization/normalization_zscore.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/__init__.py similarity index 100% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/__init__.py 
rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/__init__.py diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/arima.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/arima.py similarity index 98% rename from src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/arima.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/arima.py index 6a9d55776..793bea749 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/prediction/arima.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/data_quality/prediction/arima.py @@ -21,11 +21,14 @@ from statsmodels.tsa.arima.model import ARIMA, ARIMAResults from pmdarima import auto_arima -from ....interfaces import WranglerBaseInterface -from ....._pipeline_utils.models import Libraries, SystemType +from ....interfaces import DataManipulationBaseInterface +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) -class ArimaPrediction(WranglerBaseInterface): +class ArimaPrediction(DataManipulationBaseInterface): """ Extends the timeseries data in given DataFrame with forecasted values from an ARIMA model. 
Can be optionally set to use auto_arima, which operates a bit like a grid search, in that it tries various sets of p and q (also P and Q diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/interfaces.py similarity index 89% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/interfaces.py index bffdc4cb6..05a0df443 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/interfaces.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/interfaces.py @@ -15,7 +15,8 @@ from abc import abstractmethod from pyspark.sql import DataFrame -from ..interfaces import PipelineComponentBaseInterface + +from src.sdk.python.rtdip_sdk.pipelines.interfaces import PipelineComponentBaseInterface class MonitoringBaseInterface(PipelineComponentBaseInterface): diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/check_value_ranges.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/check_value_ranges.py similarity index 97% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/check_value_ranges.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/check_value_ranges.py index 8bf6f7d6d..632a04caa 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/check_value_ranges.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/check_value_ranges.py @@ -4,8 +4,13 @@ 
from functools import reduce from operator import or_ -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.interfaces import ( + MonitoringBaseInterface, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) class CheckValueRanges(MonitoringBaseInterface): diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/flatline_detection.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/flatline_detection.py similarity index 94% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/flatline_detection.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/flatline_detection.py index 64cf4c0f9..17b35100e 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/flatline_detection.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/flatline_detection.py @@ -17,8 +17,13 @@ from pyspark.sql.functions import col, when, lag, count, sum from pyspark.sql.window import Window -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.interfaces import ( + MonitoringBaseInterface, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) class FlatlineDetection(MonitoringBaseInterface): @@ -35,7 +40,7 @@ class FlatlineDetection(MonitoringBaseInterface): Example: ```python - from rtdip_sdk.pipelines.monitoring.spark.data_quality.flatline_detection import FlatlineDetection + from rtdip_sdk.pipelines.data_quality.monitoring.spark.flatline_detection import FlatlineDetection from pyspark.sql import SparkSession spark = SparkSession.builder.master("local[1]").appName("FlatlineDetectionExample").getOrCreate() diff --git 
a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/great_expectations_data_quality.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/great_expectations_data_quality.py similarity index 95% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/great_expectations_data_quality.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/great_expectations_data_quality.py index f8022e41c..31e6e9be2 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/great_expectations_data_quality.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/great_expectations_data_quality.py @@ -14,8 +14,13 @@ import great_expectations as gx from pyspark.sql import DataFrame, SparkSession -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.interfaces import ( + MonitoringBaseInterface, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) from great_expectations.checkpoint import ( Checkpoint, ) @@ -32,7 +37,7 @@ class GreatExpectationsDataQuality(MonitoringBaseInterface): Example -------- ```python - from src.sdk.python.rtdip_sdk.monitoring.data_quality.great_expectations.python.great_expectations_data_quality import GreatExpectationsDataQuality + from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.great_expectations_data_quality import GreatExpectationsDataQuality + from rtdip_sdk.pipelines.utilities import SparkSessionUtility import json diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_interval.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.py similarity index 92% rename from 
src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_interval.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.py index f7f63ee8c..b68a31fc0 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_interval.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.py @@ -16,9 +16,16 @@ from pyspark.sql import functions as F from pyspark.sql.window import Window -from ...interfaces import MonitoringBaseInterface -from ...._pipeline_utils.models import Libraries, SystemType -from ....utilities.spark.time_string_parsing import parse_time_string_to_ms +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.interfaces import ( + MonitoringBaseInterface, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) +from src.sdk.python.rtdip_sdk.pipelines.utilities.spark.time_string_parsing import ( + parse_time_string_to_ms, +) from src.sdk.python.rtdip_sdk.pipelines.logging.logger_manager import LoggerManager @@ -37,10 +44,14 @@ class IdentifyMissingDataInterval(MonitoringBaseInterface): mad_multiplier (float, optional): Multiplier for MAD to calculate tolerance. Default is 3. min_tolerance (str, optional): Minimum tolerance for pattern-based detection (e.g., '100ms'). Default is '10ms'. - Example: - ```python - from rtdip_sdk.pipelines.monitoring.spark.data_quality import IdentifyMissingDataInterval - from pyspark.sql import SparkSession + Returns: + df (pyspark.sql.Dataframe): Returns the original PySparkDataFrame without changes. 
+ + Example + -------- + ```python + from rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval import IdentifyMissingDataInterval + from pyspark.sql import SparkSession missing_data_monitor = IdentifyMissingDataInterval( df=df, diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_pattern.py b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.py similarity index 97% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_pattern.py rename to src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.py index dd81ae579..864ff5f09 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/identify_missing_data_pattern.py +++ b/src/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.py @@ -8,6 +8,16 @@ from ...._pipeline_utils.models import Libraries, SystemType from ....logging.logger_manager import LoggerManager from ....utilities.spark.time_string_parsing import parse_time_string_to_ms +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.interfaces import ( + MonitoringBaseInterface, +) +from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import ( + Libraries, + SystemType, +) +from src.sdk.python.rtdip_sdk.pipelines.utilities.spark.time_string_parsing import ( + parse_time_string_to_ms, +) class IdentifyMissingDataPattern(MonitoringBaseInterface): diff --git a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py deleted file mode 100644 index 24cb1a77a..000000000 --- a/src/sdk/python/rtdip_sdk/pipelines/data_wranglers/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from .spark.data_quality.duplicate_detection import * -from .spark.data_quality.normalization.normalization import * -from .spark.data_quality.normalization.normalization_mean import * -from .spark.data_quality.normalization.normalization_minmax import * -from .spark.data_quality.normalization.normalization_zscore import * -from .spark.data_quality.normalization.denormalization import * -from .spark.data_quality.prediction.arima import * -from .spark.data_quality.missing_value_imputation import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py deleted file mode 100644 index 4e3c1f154..000000000 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from .spark.data_quality.great_expectations_data_quality import * -from .spark.data_quality.identify_missing_data_interval import * -from .spark.data_quality.identify_missing_data_pattern import * -from .spark.data_quality.flatline_detection import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/__init__.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/__init__.py index 60396d776..5178ffd5e 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/__init__.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/__init__.py @@ -1,2 +1,16 @@ -from .cols_to_vector import * +# Copyright 2024 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .columns_to_vector import * from .polynomial_features import * diff --git a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/cols_to_vector.py b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/columns_to_vector.py similarity index 98% rename from src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/cols_to_vector.py rename to src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/columns_to_vector.py index 369a87cfe..e9da1241c 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/cols_to_vector.py +++ b/src/sdk/python/rtdip_sdk/pipelines/transformers/spark/machine_learning/columns_to_vector.py @@ -18,7 +18,7 @@ from ...interfaces import TransformerInterface -class ColsToVector(TransformerInterface): +class ColumnsToVector(TransformerInterface): """ Converts columns containing numbers to a column containing a vector. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py similarity index 95% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py index 5305a429e..74ed9fc2f 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/__init__.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2024 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py similarity index 95% rename from src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py index 5305a429e..74ed9fc2f 100644 --- a/src/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2024 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_arima.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_arima.py similarity index 99% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_arima.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_arima.py index 2f2d4b94a..c75137c47 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_arima.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_arima.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from abc import ABCMeta import numpy as np import pandas as pd @@ -21,7 +20,9 @@ from pyspark.sql.dataframe import DataFrame from pyspark.sql.types import StructType, StructField, StringType -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers import ArimaPrediction +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.prediction.arima import ( + ArimaPrediction, +) @pytest.fixture(scope="session") diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_duplicate_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_duplicate_detection.py similarity index 94% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_duplicate_detection.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_duplicate_detection.py index 7ce73fb42..13afcda15 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_duplicate_detection.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_duplicate_detection.py @@ -14,7 +14,9 @@ import pytest from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.duplicate_detection import ( +from pyspark.sql.dataframe import DataFrame + +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.duplicate_detection import ( DuplicateDetection, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_interval_filtering.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_interval_filtering.py similarity index 99% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_interval_filtering.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_interval_filtering.py index 472df9f3a..91f35a7b7 
100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_interval_filtering.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_interval_filtering.py @@ -16,7 +16,7 @@ import pytest from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.interval_filtering import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.interval_filtering import ( IntervalFiltering, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_k_sigma_anomaly_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_k_sigma_anomaly_detection.py similarity index 96% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_k_sigma_anomaly_detection.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_k_sigma_anomaly_detection.py index 9474645a9..b7a5ab860 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_k_sigma_anomaly_detection.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_k_sigma_anomaly_detection.py @@ -1,6 +1,6 @@ from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.k_sigma_anomaly_detection import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.k_sigma_anomaly_detection import ( KSigmaAnomalyDetection, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_missing_value_imputation.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_missing_value_imputation.py similarity index 99% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_missing_value_imputation.py rename to 
tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_missing_value_imputation.py index b45bb5e41..f9517a4a3 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_missing_value_imputation.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_missing_value_imputation.py @@ -18,7 +18,7 @@ from pyspark.sql.functions import col, unix_timestamp, abs as A from pyspark.sql.types import StructType, StructField, StringType -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers.spark.data_quality.missing_value_imputation import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.missing_value_imputation import ( MissingValueImputation, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_normalization.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_normalization.py similarity index 88% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_normalization.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_normalization.py index 6d6493ff7..a994792f9 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_normalization.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_normalization.py @@ -11,19 +11,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from abc import ABCMeta import pytest from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame -from src.sdk.python.rtdip_sdk.pipelines.data_wranglers import ( - NormalizationBaseClass, +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.normalization.denormalization import ( Denormalization, - NormalizationMinMax, +) +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.normalization.normalization import ( + NormalizationBaseClass, +) +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.normalization.normalization_mean import ( NormalizationMean, ) +from src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.data_quality.normalization.normalization_minmax import ( + NormalizationMinMax, +) @pytest.fixture(scope="session") diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_one_hot_encoding.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_one_hot_encoding.py similarity index 100% rename from tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/test_one_hot_encoding.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/test_one_hot_encoding.py diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/__init__ .py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__ .py similarity index 95% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/__init__ .py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__ .py index 5305a429e..74ed9fc2f 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/__init__ .py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/__init__ .py @@ -1,4 +1,4 @@ -# Copyright 2022 RTDIP +# Copyright 2024 RTDIP # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use 
this file except in compliance with the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py new file mode 100644 index 000000000..74ed9fc2f --- /dev/null +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 RTDIP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_check_value_ranges.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_check_value_ranges.py similarity index 98% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_check_value_ranges.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_check_value_ranges.py index caf67f08f..f14690861 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_check_value_ranges.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_check_value_ranges.py @@ -3,7 +3,7 @@ from io import StringIO import logging -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.check_value_ranges import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.check_value_ranges import ( CheckValueRanges, ) diff --git 
a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_flatline_detection.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_flatline_detection.py similarity index 97% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_flatline_detection.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_flatline_detection.py index 37d097baa..ed5eb688c 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_flatline_detection.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_flatline_detection.py @@ -1,6 +1,6 @@ import pytest from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.flatline_detection import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.flatline_detection import ( FlatlineDetection, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_great_expectations_data_quality.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_great_expectations_data_quality.py similarity index 96% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_great_expectations_data_quality.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_great_expectations_data_quality.py index 00bb57902..51584ba4b 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_great_expectations_data_quality.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_great_expectations_data_quality.py @@ -1,8 +1,7 @@ -import pytest from pytest_mock import MockerFixture -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import SparkSession -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.great_expectations_data_quality import ( +from 
src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.great_expectations_data_quality import ( GreatExpectationsDataQuality, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_interval.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_interval.py similarity index 99% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_interval.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_interval.py index 858dfae12..7725d5bd6 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_interval.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_interval.py @@ -2,7 +2,7 @@ from pyspark.sql import SparkSession from src.sdk.python.rtdip_sdk.pipelines.logging.logger_manager import LoggerManager -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.identify_missing_data_interval import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_interval import ( IdentifyMissingDataInterval, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_pattern.py b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_pattern.py similarity index 98% rename from tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_pattern.py rename to tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_pattern.py index 2a3c39bc1..fa170dafc 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/test_identify_missing_data_pattern.py +++ 
b/tests/sdk/python/rtdip_sdk/pipelines/data_quality/monitoring/spark/test_identify_missing_data_pattern.py @@ -8,7 +8,7 @@ from pyspark.sql.types import StructType, StructField, StringType -from src.sdk.python.rtdip_sdk.pipelines.monitoring.spark.data_quality.identify_missing_data_pattern import ( +from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.identify_missing_data_pattern import ( IdentifyMissingDataPattern, ) diff --git a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py deleted file mode 100644 index 5305a429e..000000000 --- a/tests/sdk/python/rtdip_sdk/pipelines/data_wranglers/spark/data_quality/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/tests/sdk/python/rtdip_sdk/pipelines/machine_learning/spark/test_linear_regression.py b/tests/sdk/python/rtdip_sdk/pipelines/machine_learning/spark/test_linear_regression.py index 493426d9e..3bae9691c 100644 --- a/tests/sdk/python/rtdip_sdk/pipelines/machine_learning/spark/test_linear_regression.py +++ b/tests/sdk/python/rtdip_sdk/pipelines/machine_learning/spark/test_linear_regression.py @@ -25,8 +25,8 @@ from src.sdk.python.rtdip_sdk.pipelines.machine_learning.spark.linear_regression import ( LinearRegression, ) -from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.machine_learning.cols_to_vector import ( - ColsToVector, +from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.machine_learning.columns_to_vector import ( + ColumnsToVector, ) from src.sdk.python.rtdip_sdk.pipelines.transformers.spark.machine_learning.polynomial_features import ( PolynomialFeatures, @@ -170,8 +170,8 @@ def sample_data(spark): # Test cases def test_cols_to_vector(sample_data): df = sample_data - # Pass the DataFrame to ColsToVector - cols_to_vector = ColsToVector(df=df, input_cols=["Value"], output_col="features") + # Pass the DataFrame to ColumnsToVector + cols_to_vector = ColumnsToVector(df=df, input_cols=["Value"], output_col="features") transformed_df = cols_to_vector.transform() assert "features" in transformed_df.columns @@ -180,8 +180,8 @@ def test_cols_to_vector(sample_data): def test_polynomial_features(sample_data): df = sample_data - # Convert 'Value' to a vector using ColsToVector - cols_to_vector = ColsToVector(df=df, input_cols=["Value"], output_col="features") + # Convert 'Value' to a vector using ColumnsToVector + cols_to_vector = ColumnsToVector(df=df, input_cols=["Value"], output_col="features") vectorized_df = cols_to_vector.transform() polynomial_features = PolynomialFeatures( @@ -197,8 +197,8 @@ def test_polynomial_features(sample_data): def test_linear_regression(sample_data): df = sample_data - # Use ColsToVector to assemble features into 
a single vector column - cols_to_vector = ColsToVector(df=df, input_cols=["Value"], output_col="features") + # Use ColumnsToVector to assemble features into a single vector column + cols_to_vector = ColumnsToVector(df=df, input_cols=["Value"], output_col="features") df = cols_to_vector.transform() linear_regression = LinearRegression( df, features_col="features", label_col="Value", prediction_col="prediction" diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py deleted file mode 100644 index 5305a429e..000000000 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py b/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py deleted file mode 100644 index 5305a429e..000000000 --- a/tests/sdk/python/rtdip_sdk/pipelines/monitoring/spark/data_quality/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2022 RTDIP -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.