From fd3bc5943060d2ac696451a6628c8a16d8af6498 Mon Sep 17 00:00:00 2001 From: dombean <46692370+dombean@users.noreply.github.com> Date: Fri, 10 May 2024 11:54:39 +0100 Subject: [PATCH 1/4] Rename Module: cdsw to cdp --- docs/reference.md | 12 +++---- rdsa_utils/{cdsw => cdp}/__init__.py | 0 rdsa_utils/{cdsw => cdp}/helpers/__init__.py | 0 .../{cdsw => cdp}/helpers/hdfs_utils.py | 0 rdsa_utils/{cdsw => cdp}/helpers/impala.py | 0 rdsa_utils/{cdsw => cdp}/io/__init__.py | 0 rdsa_utils/{cdsw => cdp}/io/input.py | 2 +- rdsa_utils/{cdsw => cdp}/io/output.py | 6 ++-- .../{cdsw => cdp}/io/pipeline_runlog.py | 2 +- tests/cdsw/helpers/test_hdfs_utils.py | 6 ++-- tests/cdsw/helpers/test_impala.py | 2 +- tests/cdsw/io/test_cdsw_input.py | 4 +-- tests/cdsw/io/test_cdsw_output.py | 34 +++++++++---------- tests/cdsw/io/test_pipeline_runlog.py | 32 ++++++++--------- 14 files changed, 50 insertions(+), 50 deletions(-) rename rdsa_utils/{cdsw => cdp}/__init__.py (100%) rename rdsa_utils/{cdsw => cdp}/helpers/__init__.py (100%) rename rdsa_utils/{cdsw => cdp}/helpers/hdfs_utils.py (100%) rename rdsa_utils/{cdsw => cdp}/helpers/impala.py (100%) rename rdsa_utils/{cdsw => cdp}/io/__init__.py (100%) rename rdsa_utils/{cdsw => cdp}/io/input.py (99%) rename rdsa_utils/{cdsw => cdp}/io/output.py (98%) rename rdsa_utils/{cdsw => cdp}/io/pipeline_runlog.py (99%) diff --git a/docs/reference.md b/docs/reference.md index 92b6c82..e800ccb 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -9,13 +9,13 @@ reference for the technical implementation of the`rdsa-utils` codebase. ::: rdsa_utils.typing ::: rdsa_utils.validation -## CDSW +## cdp -::: rdsa_utils.cdsw.helpers.hdfs_utils -::: rdsa_utils.cdsw.helpers.impala -::: rdsa_utils.cdsw.io.pipeline_runlog -::: rdsa_utils.cdsw.io.input -::: rdsa_utils.cdsw.io.output +::: rdsa_utils.cdp.helpers.hdfs_utils +::: rdsa_utils.cdp.helpers.impala +::: rdsa_utils.cdp.io.pipeline_runlog +::: rdsa_utils.cdp.io.input +::: rdsa_utils.cdp.io.output ## GCP diff --git a/rdsa_utils/cdsw/__init__.py b/rdsa_utils/cdp/__init__.py similarity index 100% rename from rdsa_utils/cdsw/__init__.py rename to rdsa_utils/cdp/__init__.py diff --git a/rdsa_utils/cdsw/helpers/__init__.py b/rdsa_utils/cdp/helpers/__init__.py similarity index 100% rename from rdsa_utils/cdsw/helpers/__init__.py rename to rdsa_utils/cdp/helpers/__init__.py diff --git a/rdsa_utils/cdsw/helpers/hdfs_utils.py b/rdsa_utils/cdp/helpers/hdfs_utils.py similarity index 100% rename from rdsa_utils/cdsw/helpers/hdfs_utils.py rename to rdsa_utils/cdp/helpers/hdfs_utils.py diff --git a/rdsa_utils/cdsw/helpers/impala.py b/rdsa_utils/cdp/helpers/impala.py similarity index 100% rename from rdsa_utils/cdsw/helpers/impala.py rename to rdsa_utils/cdp/helpers/impala.py diff --git a/rdsa_utils/cdsw/io/__init__.py b/rdsa_utils/cdp/io/__init__.py similarity index 100% rename from rdsa_utils/cdsw/io/__init__.py rename to rdsa_utils/cdp/io/__init__.py diff --git a/rdsa_utils/cdsw/io/input.py b/rdsa_utils/cdp/io/input.py similarity index 99% rename from rdsa_utils/cdsw/io/input.py rename to rdsa_utils/cdp/io/input.py index dd85ca8..d62af6d 100644 --- a/rdsa_utils/cdsw/io/input.py +++ b/rdsa_utils/cdp/io/input.py @@ -1,4 +1,4 @@ -"""Read inputs on CDSW.""" +"""Read inputs on CDP.""" import logging from typing import Tuple diff --git a/rdsa_utils/cdsw/io/output.py b/rdsa_utils/cdp/io/output.py similarity index 98% rename from rdsa_utils/cdsw/io/output.py rename to rdsa_utils/cdp/io/output.py index 75845f3..b3c746f 100644 --- a/rdsa_utils/cdsw/io/output.py +++ b/rdsa_utils/cdp/io/output.py @@ -1,4 +1,4 @@ -"""Write outputs on CDSW.""" +"""Write outputs on CDP.""" import logging from typing import Union @@ -7,8 +7,8 @@ from pyspark.sql import functions as F from pyspark.sql.utils import AnalysisException -from rdsa_utils.cdsw.helpers.hdfs_utils import delete_path, file_exists, rename -from rdsa_utils.cdsw.io.input import load_and_validate_table +from rdsa_utils.cdp.helpers.hdfs_utils import delete_path, file_exists, rename +from rdsa_utils.cdp.io.input import load_and_validate_table from rdsa_utils.exceptions import ( ColumnNotInDataframeError, DataframeEmptyError, diff --git a/rdsa_utils/cdsw/io/pipeline_runlog.py b/rdsa_utils/cdp/io/pipeline_runlog.py similarity index 99% rename from rdsa_utils/cdsw/io/pipeline_runlog.py rename to rdsa_utils/cdp/io/pipeline_runlog.py index 3e3b08e..c5d1b3d 100644 --- a/rdsa_utils/cdsw/io/pipeline_runlog.py +++ b/rdsa_utils/cdp/io/pipeline_runlog.py @@ -11,7 +11,7 @@ from pyspark.sql import functions as F from pyspark.sql.utils import AnalysisException -from rdsa_utils.cdsw.helpers.hdfs_utils import create_txt_from_string +from rdsa_utils.cdp.helpers.hdfs_utils import create_txt_from_string logger = logging.getLogger(__name__) diff --git a/tests/cdsw/helpers/test_hdfs_utils.py b/tests/cdsw/helpers/test_hdfs_utils.py index 8f004b8..bdb7a5b 100644 --- a/tests/cdsw/helpers/test_hdfs_utils.py +++ b/tests/cdsw/helpers/test_hdfs_utils.py @@ -5,7 +5,7 @@ import pytest -from rdsa_utils.cdsw.helpers.hdfs_utils import ( +from rdsa_utils.cdp.helpers.hdfs_utils import ( _perform, change_permissions, copy, @@ -240,9 +240,9 @@ def test_create_txt_from_string( ): """Verify 'echo | hadoop fs -put -' command execution by create_txt_from_string.""" with patch('subprocess.call') as subprocess_mock, patch( - 'rdsa_utils.cdsw.helpers.hdfs_utils.file_exists', + 'rdsa_utils.cdp.helpers.hdfs_utils.file_exists', ) as file_exists_mock, patch( - 'rdsa_utils.cdsw.helpers.hdfs_utils.delete_file', + 'rdsa_utils.cdp.helpers.hdfs_utils.delete_file', ) as delete_file_mock: file_exists_mock.return_value = ( replace # Assume file exists if replace is True diff --git a/tests/cdsw/helpers/test_impala.py b/tests/cdsw/helpers/test_impala.py index cf2a659..75f0fb7 100644 --- a/tests/cdsw/helpers/test_impala.py +++ b/tests/cdsw/helpers/test_impala.py @@ -1,7 +1,7 @@ """Tests for impala.py module.""" import subprocess -from rdsa_utils.cdsw.helpers.impala import invalidate_impala_metadata +from rdsa_utils.cdp.helpers.impala import invalidate_impala_metadata class TestInvalidateImpalaMetadata: diff --git a/tests/cdsw/io/test_cdsw_input.py b/tests/cdsw/io/test_cdsw_input.py index 998638d..40d2c28 100644 --- a/tests/cdsw/io/test_cdsw_input.py +++ b/tests/cdsw/io/test_cdsw_input.py @@ -1,11 +1,11 @@ -"""Tests for the cdsw/io/input.py module.""" +"""Tests for the cdp/io/input.py module.""" from unittest.mock import MagicMock import pytest from pyspark.sql import DataFrame as SparkDF +from rdsa_utils.cdp.io.input import * from rdsa_utils.exceptions import DataframeEmptyError -from rdsa_utils.cdsw.io.input import * class TestGetCurrentDatabase: diff --git a/tests/cdsw/io/test_cdsw_output.py b/tests/cdsw/io/test_cdsw_output.py index adcf790..70d35fd 100644 --- a/tests/cdsw/io/test_cdsw_output.py +++ b/tests/cdsw/io/test_cdsw_output.py @@ -1,4 +1,4 @@ -"""Tests for the cdsw/io/output.py module.""" +"""Tests for the cdp/io/output.py module.""" from typing import Callable from unittest.mock import Mock, patch @@ -6,7 +6,7 @@ from pyspark.sql import DataFrame as SparkDF from pyspark.sql import types as T -from rdsa_utils.cdsw.io.output import * +from rdsa_utils.cdp.io.output import * class TestInsertDataFrameToHiveTable: @@ -137,8 +137,8 @@ def mock_df(self): mock_df.columns = ['run_id', 'data'] return mock_df - @patch('rdsa_utils.cdsw.io.output.load_and_validate_table') - @patch('rdsa_utils.cdsw.io.output.insert_df_to_hive_table') + @patch('rdsa_utils.cdp.io.output.load_and_validate_table') + @patch('rdsa_utils.cdp.io.output.insert_df_to_hive_table') def test_write_and_read_hive_table_success( self, mock_insert, @@ -225,10 +225,10 @@ def mock_df(self) -> Mock: """Fixture for mocked Spark DataFrame.""" return Mock(spec=SparkDF) - @patch('rdsa_utils.cdsw.io.output.logger') - @patch('rdsa_utils.cdsw.io.output.delete_path') - @patch('rdsa_utils.cdsw.io.output.rename') - @patch('rdsa_utils.cdsw.io.output.file_exists') + @patch('rdsa_utils.cdp.io.output.logger') + @patch('rdsa_utils.cdp.io.output.delete_path') + @patch('rdsa_utils.cdp.io.output.rename') + @patch('rdsa_utils.cdp.io.output.file_exists') def test_save_csv_to_hdfs_success( self, mock_file_exists, @@ -251,10 +251,10 @@ def test_save_csv_to_hdfs_success( mock_delete_path.assert_called_once() assert mock_logger.info.call_count > 0 - @patch('rdsa_utils.cdsw.io.output.file_exists') - @patch('rdsa_utils.cdsw.io.output.rename') - @patch('rdsa_utils.cdsw.io.output.delete_path') - @patch('rdsa_utils.cdsw.io.output.logger') + @patch('rdsa_utils.cdp.io.output.file_exists') + @patch('rdsa_utils.cdp.io.output.rename') + @patch('rdsa_utils.cdp.io.output.delete_path') + @patch('rdsa_utils.cdp.io.output.logger') def test_overwriting_existing_file( self, mock_logger, @@ -275,7 +275,7 @@ def test_overwriting_existing_file( mock_rename.assert_called_once() - @patch('rdsa_utils.cdsw.io.output.file_exists') + @patch('rdsa_utils.cdp.io.output.file_exists') def test_save_csv_to_hdfs_file_exists_error( self, mock_file_exists, @@ -307,10 +307,10 @@ def test_save_csv_to_hdfs_invalid_file_name(self, mock_df): ('/user/hdfs/test/path', '/user/hdfs/test/path/should_write.csv'), ], ) - @patch('rdsa_utils.cdsw.io.output.file_exists') - @patch('rdsa_utils.cdsw.io.output.rename') - @patch('rdsa_utils.cdsw.io.output.delete_path') - @patch('rdsa_utils.cdsw.io.output.logger') + @patch('rdsa_utils.cdp.io.output.file_exists') + @patch('rdsa_utils.cdp.io.output.rename') + @patch('rdsa_utils.cdp.io.output.delete_path') + @patch('rdsa_utils.cdp.io.output.logger') def test_file_path_schemes( self, mock_logger, diff --git a/tests/cdsw/io/test_pipeline_runlog.py b/tests/cdsw/io/test_pipeline_runlog.py index 429c7c6..3cd73c3 100644 --- a/tests/cdsw/io/test_pipeline_runlog.py +++ b/tests/cdsw/io/test_pipeline_runlog.py @@ -3,7 +3,7 @@ import pytest from pyspark.sql import DataFrame -from rdsa_utils.cdsw.io.pipeline_runlog import ( +from rdsa_utils.cdp.io.pipeline_runlog import ( _get_run_ids, _write_entry, add_runlog_entry, @@ -134,7 +134,7 @@ def test_reserve_id_non_empty(self, mocker): # Mock _write_entry function mock_write_entry = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog._write_entry', + 'rdsa_utils.cdp.io.pipeline_runlog._write_entry', ) # Mock pyspark.sql.functions.max @@ -173,7 +173,7 @@ def test_reserve_id_edge_case(self, mocker): # Mock _write_entry function mock_write_entry = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog._write_entry', + 'rdsa_utils.cdp.io.pipeline_runlog._write_entry', ) # Mock pyspark.sql.functions.max @@ -299,7 +299,7 @@ def test_get_last_run_id_general_pipeline_non_empty(self, mocker): # Patch _get_run_ids function and return a Mock object get_run_ids_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog._get_run_ids', + 'rdsa_utils.cdp.io.pipeline_runlog._get_run_ids', return_value=[3, 2, 1], ) @@ -319,7 +319,7 @@ def test_get_last_run_id_specific_pipeline_empty(self, mocker): # Patch _get_run_ids function and return a Mock object get_run_ids_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog._get_run_ids', + 'rdsa_utils.cdp.io.pipeline_runlog._get_run_ids', return_value=[], ) @@ -352,7 +352,7 @@ def test_penultimate_run_id_non_empty(self, mocker): # Patch _get_run_ids function and return a Mock object get_run_ids_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog._get_run_ids', + 'rdsa_utils.cdp.io.pipeline_runlog._get_run_ids', return_value=[3, 2, 1], ) @@ -387,7 +387,7 @@ def test_penultimate_run_id_edge_cases(self, mocker): # Patch _get_run_ids function and return a Mock object get_run_ids_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog._get_run_ids', + 'rdsa_utils.cdp.io.pipeline_runlog._get_run_ids', return_value=[1], ) @@ -409,7 +409,7 @@ def test_penultimate_run_id_edge_cases(self, mocker): # Patch _get_run_ids function and return a Mock object get_run_ids_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog._get_run_ids', + 'rdsa_utils.cdp.io.pipeline_runlog._get_run_ids', return_value=[], ) @@ -512,16 +512,16 @@ def test_add_runlog_entry(self, mocker): # Mock reserve_id, create_runlog_entry, _write_entry reserve_id_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog.reserve_id', + 'rdsa_utils.cdp.io.pipeline_runlog.reserve_id', return_value=1, ) entry_mock = mocker.Mock() create_runlog_entry_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog.create_runlog_entry', + 'rdsa_utils.cdp.io.pipeline_runlog.create_runlog_entry', return_value=entry_mock, ) _write_entry_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog._write_entry', + 'rdsa_utils.cdp.io.pipeline_runlog._write_entry', ) # Call function @@ -565,11 +565,11 @@ def test_add_runlog_entry_specified_id(self, mocker): # Mock create_runlog_entry, _write_entry entry_mock = mocker.Mock() create_runlog_entry_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog.create_runlog_entry', + 'rdsa_utils.cdp.io.pipeline_runlog.create_runlog_entry', return_value=entry_mock, ) _write_entry_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog._write_entry', + 'rdsa_utils.cdp.io.pipeline_runlog._write_entry', ) # Call function @@ -613,11 +613,11 @@ def test_write_runlog_file(self, mocker): # Mock _parse_runlog_as_string and create_txt_from_string parse_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog._parse_runlog_as_string', + 'rdsa_utils.cdp.io.pipeline_runlog._parse_runlog_as_string', ) parse_mock.return_value = 'test metadata' create_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog.create_txt_from_string', + 'rdsa_utils.cdp.io.pipeline_runlog.create_txt_from_string', ) # Call function @@ -639,7 +639,7 @@ def test_write_runlog_file_edge_case(self, mocker): # Mock _parse_runlog_as_string parse_mock = mocker.patch( - 'rdsa_utils.cdsw.io.pipeline_runlog._parse_runlog_as_string', + 'rdsa_utils.cdp.io.pipeline_runlog._parse_runlog_as_string', ) parse_mock.return_value = 'test metadata' From 11d83447a426562ce5d8a76e5dc67d807e5c19e8 Mon Sep 17 00:00:00 2001 From: dombean <46692370+dombean@users.noreply.github.com> Date: Fri, 10 May 2024 11:55:39 +0100 Subject: [PATCH 2/4] Update CHANGELOG.md --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a36813..882d93b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,8 @@ and this project adheres to [semantic versioning](https://semver.org/spec/v2.0.0 ### Added ### Changed - +- **Breaking Change**: Renamed module `cdsw` to `cdp` (Cloudera Data Platform). + ### Deprecated ### Fixed From 6add038da57ee74912f4a0e7b7238b14d79a537b Mon Sep 17 00:00:00 2001 From: dombean <46692370+dombean@users.noreply.github.com> Date: Fri, 10 May 2024 11:57:27 +0100 Subject: [PATCH 3/4] Capitalise CDP --- docs/reference.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference.md b/docs/reference.md index e800ccb..b1b09d1 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -9,7 +9,7 @@ reference for the technical implementation of the`rdsa-utils` codebase. ::: rdsa_utils.typing ::: rdsa_utils.validation -## cdp +## CDP ::: rdsa_utils.cdp.helpers.hdfs_utils ::: rdsa_utils.cdp.helpers.impala From 2eff3a6dbe375c87b298322c8a2df6985e9794eb Mon Sep 17 00:00:00 2001 From: dombean <46692370+dombean@users.noreply.github.com> Date: Fri, 10 May 2024 12:01:53 +0100 Subject: [PATCH 4/4] Rename Module in tests/: cdsw to cdp --- tests/{cdsw => cdp}/helpers/test_hdfs_utils.py | 0 tests/{cdsw => cdp}/helpers/test_impala.py | 0 tests/{cdsw => cdp}/io/test_cdsw_input.py | 0 tests/{cdsw => cdp}/io/test_cdsw_output.py | 0 tests/{cdsw => cdp}/io/test_pipeline_runlog.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename tests/{cdsw => cdp}/helpers/test_hdfs_utils.py (100%) rename tests/{cdsw => cdp}/helpers/test_impala.py (100%) rename tests/{cdsw => cdp}/io/test_cdsw_input.py (100%) rename tests/{cdsw => cdp}/io/test_cdsw_output.py (100%) rename tests/{cdsw => cdp}/io/test_pipeline_runlog.py (100%) diff --git a/tests/cdsw/helpers/test_hdfs_utils.py b/tests/cdp/helpers/test_hdfs_utils.py similarity index 100% rename from tests/cdsw/helpers/test_hdfs_utils.py rename to tests/cdp/helpers/test_hdfs_utils.py diff --git a/tests/cdsw/helpers/test_impala.py b/tests/cdp/helpers/test_impala.py similarity index 100% rename from tests/cdsw/helpers/test_impala.py rename to tests/cdp/helpers/test_impala.py diff --git a/tests/cdsw/io/test_cdsw_input.py b/tests/cdp/io/test_cdsw_input.py similarity index 100% rename from tests/cdsw/io/test_cdsw_input.py rename to tests/cdp/io/test_cdsw_input.py diff --git a/tests/cdsw/io/test_cdsw_output.py b/tests/cdp/io/test_cdsw_output.py similarity index 100% rename from tests/cdsw/io/test_cdsw_output.py rename to tests/cdp/io/test_cdsw_output.py diff --git a/tests/cdsw/io/test_pipeline_runlog.py b/tests/cdp/io/test_pipeline_runlog.py similarity index 100% rename from tests/cdsw/io/test_pipeline_runlog.py rename to tests/cdp/io/test_pipeline_runlog.py