diff --git a/.coverage b/.coverage index 644d059..4dedabd 100644 Binary files a/.coverage and b/.coverage differ diff --git a/README.md b/README.md index 64dff69..de504d3 100644 --- a/README.md +++ b/README.md @@ -54,17 +54,6 @@ The API documentation is available at ## Parameters -- Depth of layers - - The depth of layer is one key consideration in converting a single config - file into the folder structure. - - The default behaviour is to crate nested folder until a single file - contains no sub-keys. However, in some cases this might not be ideal. - Therefore, user can control the depth of layer through the following - parameter. - ```bash - # default is -1, file does not contain sub-key - configen --config-path=config_path --depth=-1 - ``` - Ignore - User can choose to ignore some keys and not expand into sub-folders ```bash diff --git a/configen/__init__.py b/configen/__init__.py index 49fa750..ae62e29 100644 --- a/configen/__init__.py +++ b/configen/__init__.py @@ -2,7 +2,12 @@ .. include:: ../README.md """ from importlib.metadata import version +from configen.configen import cli +from configen.base_parser import Parser +from configen.json_parser import JsonParser +from configen.yaml_parser import YamlParser __author__ = "Ling" __email__ = "lingjie@u.nus.edu" __version__ = version("configen") +__all__ = [cli, Parser, JsonParser, YamlParser] diff --git a/configen/base_parser.py b/configen/base_parser.py index 869b1d7..6d9d8c4 100644 --- a/configen/base_parser.py +++ b/configen/base_parser.py @@ -1,17 +1,39 @@ """Defines the base parser requirements.""" from __future__ import annotations -from typing import Union import abc +import os +import re +from collections import deque +from typing import Tuple, Union -class Parser(abc.ABC): +class Parser: """The base Parser.""" - extension: str + + extension: str = "" """The parser file extension.""" - config: dict + config: dict = {} """The loaded config.""" + def __init__(self, config: Union[dict, None] = None): + """Initiate object with optional initial config.""" + if config is not None: + assert isinstance( + config, dict + ), f"Expected config to be dict get {type(config)}" + self.config = config + + def __eq__(self, parser: object) -> bool: + """Compares if given parser is same as self.""" + if isinstance(parser, Parser): + return parser.extension == self.extension + return False + + def __str__(self): + """Returns the print value.""" + return self.extension + def _append_extension(self, input_path: str) -> str: """Output the input_path with the file extension. @@ -26,9 +48,11 @@ def _append_extension(self, input_path: str) -> str: `_check_extension("config.json")` -> config.json """ - # TODO: implement better extension checks - if input_path.split(".")[-1] != self.extension: - input_path += f".{self.extension}" + assert isinstance(input_path, str), f"expected type str got {type(input_path)}" + + filename, file_extension = os.path.splitext(input_path) + if file_extension != "." + self.extension: + input_path += "." + self.extension return input_path @abc.abstractmethod @@ -63,7 +87,22 @@ def _write_method(self, filename: str) -> Parser: """ pass - def load(self, config: Union[str, dict]) -> Parser: + @staticmethod + def _search_match(name: str, ignored: Tuple[str]) -> bool: + """Checks if the name is present in the ignored list.""" + assert isinstance(name, str), f"Expected name as str, get {type(name)}" + assert isinstance( + ignored, tuple + ), f"Expected ignored as tuple, get {type(name)}" + + for ignore in ignored: + # if there is a regex match, return true + result = re.search(ignore, name) + if isinstance(result, re.Match): + return True + return False + + def load(self, config: Union[str, dict, None], ignored: Tuple[str] = ()) -> Parser: """Loads the config (single, or multiple files, or dict). Params: @@ -73,6 +112,8 @@ def load(self, config: Union[str, dict]) -> Parser: 2. filepath for a folder of configs 3. dictionary containing the config itself + ignored: list of regex match strings to ignore in file names + Returns: self with the config loaded in memory @@ -83,9 +124,74 @@ def load(self, config: Union[str, dict]) -> Parser: dictionary: `load({"name": "config"}) """ - pass + if config is not None: + assert isinstance( + config, (str, dict) + ), f"expected (str, dict) got {type(config)}" + + if isinstance(ignored, str): + ignored = (ignored,) + + assert isinstance(ignored, tuple), "expected ignored as tuple, got {type(ignored)}" + + # if config is None, then remove the stored config + if config is None: + self.config = None + return self + + # if given dictionary then stores it and end + if isinstance(config, dict): + self.config = config + return self + + filename, file_extension = os.path.splitext(config) + # if the config is a single config + if file_extension == "." + self.extension: + self.config = self._load_method(config) + return self + + # idea: iterate through the root folder, parse all configs + # stores the folder into a queue, then literately retrieve queue to + # maintain folder hierarchy + files = os.listdir(config) + queue: deque[str] = deque() + + if self.config is None: + self.config = {} + + # first iteration to get the depth 1 keys and folders + for file in files: + # skip those in the ignored + if self._search_match(file, ignored): + continue + filename, file_extension = os.path.splitext(file) + filepath = os.path.join(config, file) + if file_extension == "." + self.extension: + self.config.update(self._load_method(filepath)) + elif os.path.isdir(filepath): + queue.append(filepath) + + # while queue is not empty, repeat the procedure + while queue: + folder = queue.pop() + files = os.listdir(folder) + for file in files: + # skip those in the ignored + if self._search_match(file, ignored): + continue + filename, file_extension = os.path.splitext(file) + filepath = os.path.join(folder, file) + base_folder = os.path.basename(folder) + if base_folder not in self.config: + self.config[base_folder] = {} + if file_extension == "." + self.extension: + self.config[base_folder].update(self._load_method(filepath)) + elif os.path.isdir(filepath): + queue.append(filepath) - def write(self, filename: str, config: Union[str, dict, None]) -> Parser: + return self + + def write(self, filename: str, config: Union[str, dict, None] = None) -> Parser: """Writs the config to file. Parms: @@ -93,6 +199,9 @@ def write(self, filename: str, config: Union[str, dict, None]) -> Parser: config: the config file, if not provided use config stored in object + depth: how deep should we go, if -1 then every config file does not + contain sub-keys else the max folder layer is the depth parameter. + Returns: self with config written to file @@ -101,30 +210,63 @@ def write(self, filename: str, config: Union[str, dict, None]) -> Parser: `write("config.json", {"name": "config1"})` """ - pass + if config is not None: + assert isinstance( + config, (str, dict) + ), f"expected str, dict, None got {type(config)}" + + # if given config, need to store the old config and restore later + if config is None: + # if no config is given just replace back the old config + self_config, self.config = self.config, self.config + + else: + # if given config + self_config, self.config = self.config, config + + self._write_method(filename) + + # restore config + self = self.load(self_config) - def convert(self, config_path: str, filename: str, parser: type(Parser)) -> Parser: + return self + + def convert( + self, filename: str, parser: type[Parser], config_path: Union[str, None] = None + ) -> Parser: """Converts the config file into another file extension. Params: - config_path: file path to the config file - filename: the file path to be written as parser: the parser to be used for conversion + config_path: file path to the config file, if no path is given then + use the config stored in self + Returns: self Example: `convert("config.json", "config.yml", YamlPaser)` """ + if config_path is not None: + assert isinstance( + config_path, str + ), f"expected str or None got {type(config_path)}" + assert isinstance(filename, str), f"expected str got {type(filename)}" + assert isinstance(parser, Parser), f"expected ktr got {type(parser)}" + # ensure the file extension are correct - config_path = self._append_extension(config_path) - filename = parser.append_extension(filename) + if config_path is not None: + config_path = self._append_extension(config_path) + filename = parser._append_extension(filename) # load the config from given path - config = self.load(config_path) + if config_path is not None: + config = self.load(config_path).config + else: + config = self.config # writes config based on the given parser parser.write(filename=filename, config=config) diff --git a/configen/configen.py b/configen/configen.py index 255bf41..e9cd16f 100644 --- a/configen/configen.py +++ b/configen/configen.py @@ -1,2 +1,11 @@ """Entry point for program""" -print("Entry point") + + +def cli(): + """Command line interface entry point. + + Example: + configen config.json + """ + # TODO:: function should be able to read a mixture of config types + pass diff --git a/configen/json_parser.py b/configen/json_parser.py index 610f9db..95728a9 100644 --- a/configen/json_parser.py +++ b/configen/json_parser.py @@ -1,17 +1,13 @@ import json -from ruamel.yaml import YAML from configen.base_parser import Parser -_yaml = YAML() -_yaml.indent(mapping=2, sequence=4, offset=2) - class JsonParser(Parser): """Json parser.""" extension = "json" def _write_method(self, filename: str) -> Parser: - filename = self.check_extension(filename) + filename = self._append_extension(filename) with open(filename, "w") as file: json.dump(self.config, file, indent=4) @@ -19,9 +15,9 @@ def _write_method(self, filename: str) -> Parser: return self def _load_method(self, filename: str) -> dict: - filename = self.check_extension(filename) + filename = self._append_extension(filename) with open(filename, "r") as json_config: - config = json.load(json_config) + config = json.loads(json_config.read()) return config diff --git a/configen/yaml_parser.py b/configen/yaml_parser.py index 9615da7..7632461 100644 --- a/configen/yaml_parser.py +++ b/configen/yaml_parser.py @@ -1,4 +1,3 @@ -import os from ruamel.yaml import YAML from configen.base_parser import Parser @@ -10,19 +9,19 @@ class YamlParser(Parser): """Yaml parser.""" extension = "yml" - def _load_method(self, filename: str) -> dict: - filename = self.check_extension(filename) - - with open(filename, "r") as file: - config = _yaml.load(file.read()) - - return config - def _write_method(self, filename: str) -> Parser: # check if the given path ends with a yaml file extension - filename = self.check_extension(filename) + filename = self._append_extension(filename) with open(filename, "w") as file: _yaml.dump(self.config, file) return self + + def _load_method(self, filename: str) -> dict: + filename = self._append_extension(filename) + + with open(filename, "r") as file: + config = _yaml.load(file.read()) + + return config diff --git a/notebooks/README.md b/notebooks/README.md deleted file mode 100644 index bc314c2..0000000 --- a/notebooks/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# Notebooks - -Notebooks are used to demonstrate the usage of configen and -some visualisation results. diff --git a/sample-config/config-json/function/function_1.json b/sample-config/config-json/function/function_1.json new file mode 100644 index 0000000..489cde4 --- /dev/null +++ b/sample-config/config-json/function/function_1.json @@ -0,0 +1,6 @@ +{ + "function1": { + "name": "transform", + "param": "col1" + } +} diff --git a/sample-config/config-json/function/function_2.json b/sample-config/config-json/function/function_2.json new file mode 100644 index 0000000..efc976b --- /dev/null +++ b/sample-config/config-json/function/function_2.json @@ -0,0 +1,6 @@ +{ + "function2": { + "name": "load", + "param": "col2" + } +} diff --git a/sample-config/config-json/main.json b/sample-config/config-json/main.json new file mode 100644 index 0000000..ea92ae2 --- /dev/null +++ b/sample-config/config-json/main.json @@ -0,0 +1,9 @@ +{ + "name": "config-01", + "training": true, + "parameters": { + "num_nodes": 200, + "num_samples": 100, + "max_time": 40 + } +} diff --git a/sample-config/config-json/pipeline.json b/sample-config/config-json/pipeline.json new file mode 100644 index 0000000..bc7a80a --- /dev/null +++ b/sample-config/config-json/pipeline.json @@ -0,0 +1,20 @@ +{ + "pipeline": [ + { + "name": "extraction", + "function": "etl.extraction" + }, + { + "name": "training", + "function": "model.training" + }, + { + "name": "evaluation", + "function": "model.evaluation" + }, + { + "name": "deployment", + "function": "cloud.deploy" + } + ] +} diff --git a/sample-config/config-mix/basic.yml b/sample-config/config-mix/basic.yml new file mode 100644 index 0000000..750c93d --- /dev/null +++ b/sample-config/config-mix/basic.yml @@ -0,0 +1,6 @@ +# Here is where we define the config basic info such as name +--- +name: "config-01" +# if to re-train the model +# if training is false, will simply deploy the model +training: true diff --git a/sample-config/config-mix/function/function1.json b/sample-config/config-mix/function/function1.json new file mode 100644 index 0000000..cd09734 --- /dev/null +++ b/sample-config/config-mix/function/function1.json @@ -0,0 +1,6 @@ +{ + "function1": { + "name": "transform", + "param": "col1" + } +} diff --git a/sample-config/config-mix/function/function2.yml b/sample-config/config-mix/function/function2.yml new file mode 100644 index 0000000..5c49950 --- /dev/null +++ b/sample-config/config-mix/function/function2.yml @@ -0,0 +1,5 @@ +# defining the load function +--- +function2: + name: "load" + param: "col" diff --git a/sample-config/config-mix/param.yml b/sample-config/config-mix/param.yml new file mode 100644 index 0000000..096effc --- /dev/null +++ b/sample-config/config-mix/param.yml @@ -0,0 +1,11 @@ +# storing the model training parameters +--- +parameters: + # number of compute nodes + num_nodes: 200 + # number of MC samples + num_samples: 100 + # the max model training time allowed + # early stopping is implemented. Therefore, this max time should be used as a + # safeguard for damage control instead of preventing over-fitting. + max_time: 40 diff --git a/sample-config/config-mix/pipeline.json b/sample-config/config-mix/pipeline.json new file mode 100644 index 0000000..1b817a1 --- /dev/null +++ b/sample-config/config-mix/pipeline.json @@ -0,0 +1,20 @@ +{ + "pipeline": [ + { + "name": "extraction", + "function": "etl.extraction" + }, + { + "name": "training", + "function": "model.training" + }, + { + "name": "evaluation", + "function": "model.evaluation" + }, + { + "name": "deployment", + "function": "cloud.deploy" + } + ] +} diff --git a/sample-config/config-yml/function/function_1.yml b/sample-config/config-yml/function/function_1.yml new file mode 100644 index 0000000..cb9df1f --- /dev/null +++ b/sample-config/config-yml/function/function_1.yml @@ -0,0 +1,4 @@ +--- +function1: + name: "transform" + param: "col1" diff --git a/sample-config/config-yml/function/function_2.yml b/sample-config/config-yml/function/function_2.yml new file mode 100644 index 0000000..d4ad8c6 --- /dev/null +++ b/sample-config/config-yml/function/function_2.yml @@ -0,0 +1,4 @@ +--- +function2: + name: "load" + param: "col2" diff --git a/sample-config/config-yml/main.yml b/sample-config/config-yml/main.yml new file mode 100644 index 0000000..cc1d708 --- /dev/null +++ b/sample-config/config-yml/main.yml @@ -0,0 +1,15 @@ +# Sample config +--- +name: "config-01" +# if to re-train the model +# if training is false, will simply deploy the model +training: true +parameters: + # number of compute nodes + num_nodes: 200 + # number of MC samples + num_samples: 100 + # the max model training time allowed + # early stopping is implemented. Therefore, this max time should be used as a + # safeguard for damage control instead of preventing over-fitting. + max_time: 40 diff --git a/sample-config/config-yml/pipeline.yml b/sample-config/config-yml/pipeline.yml new file mode 100644 index 0000000..5c94f25 --- /dev/null +++ b/sample-config/config-yml/pipeline.yml @@ -0,0 +1,12 @@ +# pipeline +--- +# model training pipeline +pipeline: + - name: "extraction" + function: "etl.extraction" + - name: "training" + function: "model.training" + - name: "evaluation" + function: "model.evaluation" + - name: "deployment" + function: "cloud.deploy" diff --git a/sample-config/sample.json b/sample-config/sample.json index 49a7f8d..dd6f1a2 100644 --- a/sample-config/sample.json +++ b/sample-config/sample.json @@ -23,5 +23,15 @@ "name": "deployment", "function": "cloud.deploy" } - ] + ], + "function": { + "function1": { + "name": "transform", + "param": "col1" + }, + "function2": { + "name": "load", + "param": "col2" + } + } } diff --git a/sample-config/sample.yml b/sample-config/sample.yml index f68253b..93fe99b 100644 --- a/sample-config/sample.yml +++ b/sample-config/sample.yml @@ -23,3 +23,10 @@ pipeline: function: "model.evaluation" - name: "deployment" function: "cloud.deploy" +function: + function1: + name: "transform" + param: "col1" + function2: + name: "load" + param: "col2" diff --git a/tests/test_configen.py b/tests/test_configen.py index 385d6d8..85b2192 100644 --- a/tests/test_configen.py +++ b/tests/test_configen.py @@ -4,11 +4,16 @@ class Test(unittest.TestCase): """Testing the entry point.""" + # TODO: def test_dummy(self: unittest.TestCase) -> None: """Dummy test.""" self.assertEqual(True, True) + def test_mixture(self): + """function should be able to read a mixture of config types.""" + pass + if __name__ == "__main__": unittest.main() diff --git a/tests/test_parser.py b/tests/test_parser.py new file mode 100644 index 0000000..c47804d --- /dev/null +++ b/tests/test_parser.py @@ -0,0 +1,255 @@ +"""Test the parser.""" +import unittest +import tempfile # create temp config files +from contextlib import contextmanager +import os +from typing import Tuple + +from configen import Parser, JsonParser, YamlParser + + +class TestParser(unittest.TestCase): + """Perform unit test for the parsers.""" + + parsers: Tuple[Parser] = (JsonParser, YamlParser) + """The parsers to test.""" + + # ground truth + config_truth = { + "name": "config-01", + "training": True, + "parameters": { + "num_nodes": 200, + "num_samples": 100, + "max_time": 40 + }, + "pipeline": [ + { + "name": "extraction", + "function": "etl.extraction" + }, + { + "name": "training", + "function": "model.training" + }, + { + "name": "evaluation", + "function": "model.evaluation" + }, + { + "name": "deployment", + "function": "cloud.deploy" + } + ], + "function": { + "function1": { + "name": "transform", + "param": "col1" + }, + "function2": { + "name": "load", + "param": "col2" + } + } + } + new_config = {"name": "new"} + + # parser configs + dir_path = "sample-config" + config_path = { + "json": f"{dir_path}/sample.json", + "yml": f"{dir_path}/sample.yml" + } + config_folder = { + "json": f"{dir_path}/config-json", + "yml": f"{dir_path}/config-yml" + } + config_dict_raw = {} + + for ext, path in config_path.items(): + with open(path) as file: + config_raw = file.read() + config_dict_raw[ext] = config_raw + + @contextmanager + def write_tempfile(self, filename: str, config: str): + """Writes a given config to a filename under a tempdir, yield the filename.""" + tempdir = tempfile.TemporaryDirectory() + tempdirname = tempdir.name + filepath = os.path.join(tempdirname, filename) + with open(filepath, "w") as file: + file.write(str(config)) + try: + yield filepath + finally: + tempdir.cleanup() + + def test_write_tempfile(self): + """Function should write a config to a specific filename.""" + config = {"name": "config1"} + with self.write_tempfile( + filename="config.json", + config=config) as filepath: + print(filepath) + with open(filepath) as file: + self.assertEqual(file.read(), str(config), file.read()) + + def test_append_extension(self): + """Function should append appropriate file extensions.""" + + for parser in self.parsers: + parser = parser() + ext = parser.extension + # without a given file extension, function will append extension + self.assertEqual( + f"config.{ext}", parser._append_extension("config")) + # with given file extension, function does not append extension + self.assertEqual( + f"config.{ext}", parser._append_extension(f"config.{ext}")) + # given a extension that is not the parser extension, parser will + # still append the extension + self.assertEqual( + f"config.tmp.{ext}", parser._append_extension(f"config.tmp.{ext}")) + # raises assertion error when give non string input + self.assertRaises(AssertionError, parser._append_extension, 123) + + def test_load_method(self): + """Function should load different kind of config files.""" + + for parser in self.parsers: + parser = parser() + ext = parser.extension + # write a sample config + sample_config = self.config_dict_raw[ext] + with self.write_tempfile( + filename="config." + ext, + config=sample_config) as filename: + # load the config + config_loaded = parser._load_method(filename) + # yaml will load the config as ordered dictionary + config_loaded = dict(config_loaded) + self.assertEqual(config_loaded, self.config_truth, parser) + + def test_write_method(self): + """Function should be able to write different kind of config files.""" + + for parser in self.parsers: + # load sample config + ext = parser.extension + parser = parser(self.config_truth) + # write + with self.write_tempfile( + filename="config." + ext, + config="") as filename: + # write the config + parser._write_method(filename) + # rely on the implemented load method + loaded_config = parser._load_method(filename) + self.assertIsInstance(loaded_config, dict) + print(type(loaded_config)) + self.assertEqual(loaded_config, self.config_truth, parser) + + def test_load(self): + """Function should be able to load single config, a folder of configs + and a dictionary containing the config itself.""" + for parser in self.parsers: + parser = parser() + ext = parser.extension + # single config + config_path = self.config_path[ext] + loaded_config = parser.load(config_path).config + self.assertEqual(loaded_config, self.config_truth, parser) + + # folder of config + config_folder = self.config_folder[ext] + loaded_config = parser.load(config_folder).config + self.assertEqual(loaded_config, self.config_truth, parser) + + # dictionary containing the config + loaded_config = parser.load(self.config_truth).config + self.assertEqual(loaded_config, self.config_truth, parser) + + def test_write(self): + """Function should be able to write stored config to file, or a new + config to file.""" + for parser in self.parsers: + parser = parser(self.config_truth) + ext = parser.extension + # write stored config + with self.write_tempfile( + filename="config." + ext, + config="") as filename: + parser.write(filename) + loaded_config = parser._load_method(filename) + self.assertEqual(loaded_config, self.config_truth, parser) + + # write new config + with self.write_tempfile( + filename="config." + ext, + config="") as filename: + parser.write(filename, self.new_config) + loaded_config = parser._load_method(filename) + self.assertEqual(loaded_config, self.new_config, parser) + + def test_search_method(self): + """Function return True if regex match, else False.""" + parser = Parser() + ignored = ("pipeline.*",) + self.assertTrue(parser._search_match("pipeline.json", ignored=ignored)) + self.assertFalse(parser._search_match("nihao.yml", ignored=ignored)) + + ignored = ("^pipeline.*",) + self.assertTrue(parser._search_match("pipeline.json", ignored=ignored)) + self.assertFalse(parser._search_match("nihao.yml", ignored=ignored)) + + ignored = (".*json",) + self.assertTrue(parser._search_match("pipeline.json", ignored=ignored)) + self.assertFalse(parser._search_match("nihao.yml", ignored=ignored)) + + def test_ignore(self): + """Function should be able to ignore some config files.""" + config_truth = self.config_truth.copy() + config_truth.pop("pipeline") + for parser in self.parsers: + parser = parser() + ext = parser.extension + parser.load(config=self.config_folder[ext], ignored="pipeline.*") + loaded_config = dict(parser.config) + self.assertEqual(loaded_config, config_truth, parser) + + def test_convert(self): + """Function should be able to convert from one config file to + another.""" + + # test saving config from one parser against all the other parsers + for parser in self.parsers: + other_parsers = list(filter(lambda x: x != parser, self.parsers)) + parser = parser(self.new_config) + ext = parser.extension + for other_parser in other_parsers: + other_parser = other_parser() + other_ext = parser.extension + # save the stored config + with self.write_tempfile( + filename="config." + other_ext, + config="") as filename: + parser.convert(filename=filename, parser=other_parser) + loaded_config = other_parser._load_method(filename) + self.assertEqual(loaded_config, self.new_config, f"{parser}, {other_parser}") + + # save the config path + with self.write_tempfile( + filename="config." + other_ext, + config="") as filename: + config_path = self.config_path[ext] + parser.convert( + filename=filename, parser=other_parser, config_path=config_path) + loaded_config = other_parser._load_method(filename) + # yaml load config as ordered dict + # we convert it back to dict for comparison + loaded_config = dict(loaded_config) + self.assertEqual(loaded_config, self.config_truth, f"{parser}, {other_parser}") + + +if __name__ == "__main__": + unittest.main()