Merge pull request #129 from IDEMSInternational/feat/inheritance
Add support for inheritance
geoo89 authored Jun 26, 2024
2 parents ec8a86a + 40d6375 commit 23e4a69
Showing 21 changed files with 1,470 additions and 3,201 deletions.
1 change: 1 addition & 0 deletions .flake8
@@ -1,2 +1,3 @@
[flake8]
max-line-length = 88
extend-ignore = E203
2 changes: 2 additions & 0 deletions .git-blame-ignore-revs
@@ -0,0 +1,2 @@
# Apply style guidelines
0294ee15e9f6f0d9e18134370703acb845bdb0cd
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,6 +1,6 @@
# Intermediary output files
temp/
output/
src/parenttext_pipeline/_version.py

#credentials files
credentials.json
10 changes: 8 additions & 2 deletions README.md
@@ -11,10 +11,15 @@ Handles the process for producing RapidPro flows from data held in spreadsheets.

# Run

To start the pipeline:
Two [operations] are currently available:

- `pull_data`: Read data from various sources and store it locally in json format.
- `compile_flows`: Compile RapidPro flows from locally stored json files.

To start the pipeline performing both operations in sequence:

```
python -m parenttext_pipeline.cli
python -m parenttext_pipeline.cli pull_data compile_flows
```

You will need to create a file called 'config.py', in the current working directory, and define a callable called 'create_config' that returns the pipeline settings as a dict. More details can be found in the [configuration page][config].
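
A minimal sketch of such a file (the actual settings are omitted here; see the [configuration page][config] for the available fields):

```python
# config.py -- minimal sketch; see docs/configuration.md for the available settings
def create_config():
    return {
        # pipeline settings go here
    }
```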
@@ -27,6 +32,7 @@ You will need to create a file called 'config.py', in the current working direct
- [Transcode tool] - to prepare video and audio files that may be used by ParentText applications


[operations]: docs/operations.md
[config]: docs/configuration.md
[Archive tool]: docs/archive.md
[RapidPro flow importer]: docs/rapidpro-import.md
170 changes: 18 additions & 152 deletions docs/configuration.md
@@ -15,160 +15,26 @@ The `create_config` callable must return a `dict` of configuration settings.

# Available settings

## sources
The main features of the config are a list of steps of the pipeline, and a list of sources to pull data from.
Steps are executed in order: the first step produces a temporary flow output file, and subsequent steps generally operate on the output of the previous step, most of the time (but not always) producing a new (temporary) flow output file. Some steps may also produce output artefacts other than flows, such as a list of translatable strings, or logs or reports for QA; subsequent steps cannot read such outputs, however. For more details about steps, see [steps].
There are different types of steps, and some types of steps may need additional input data that is used to create or operate on the input flows. Such data is defined in data sources, which may reference local files or files from the internet, in various formats. Steps may then reference one or multiple such data sources. For more details about sources, see [sources].

### sources.filename
The *pull_data* operation takes the data referenced by all sources and saves it to the local file system (folder `{inputpath}`), converted to json. It is agnostic of the actual steps.

The name prefix that will be used in filenames during processing.
The *compile_flows* operation executes the sequence of steps and writes the output to `{flows_outputbasename}.json` in `{outputpath}`.

### sources.spreadsheet\_ids
The config has the following fields (an illustrative example follows the list):

IDs of Google Sheets where the ParentText flows are defined.
- `meta`: meta information such as the pipeline version the config needs to be run with
- `inputpath`, `temppath` and `outputpath` (optional): Path to store/read input files, temp files, and output files.
- `flows_outputbasename`: Base filename of the output file and intermediate temp files.
- `output_split_number` (optional): Number of files to split the pipeline output (final flow definition) into.
- Used to divide the file at the final step to get it to a manageable size that can be uploaded to RapidPro.
- `steps`: A list of steps. For more details, see [steps]
- `sources`: A dictionary of data sources. For more details, see [sources]
- `sheet_names`: A dictionary mapping sheet names to sheet IDs (**for Google Sheets only**).
Sources can reference sheets by their ID or by their sheet name.
- `parents`: **Not implemented.** One (or possibly multiple) parent repos whose sources can be referenced
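
To illustrate, here is a sketch of a `create_config` using these fields. All names and values below are placeholders; the exact fields of steps and sources are described in [steps] and [sources]:

```python
# config.py -- illustrative sketch only; names and values are placeholders
def create_config():
    return {
        "meta": {},                       # meta information, e.g. pipeline version
        "inputpath": "input",             # optional
        "temppath": "temp",               # optional
        "outputpath": "output",           # optional
        "flows_outputbasename": "parenttext_all",
        "output_split_number": 1,         # optional
        "sheet_names": {
            # Google Sheets only: map sheet names to sheet IDs
            "my_sheet": "<google-sheet-id>",
        },
        "sources": {
            # data sources, keyed by id -- see sources.md
        },
        "steps": [
            # processing steps, executed in order -- see steps.md
        ],
    }
```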

### sources.crowdin\_name

Name of the file that is produced to send to translators.

### sources.tags

Used to identify flows to be processed. Possible values for tag 1:

- onboarding
- dev\_assess
- ltp_activity
- home\_activity\_checkin
- module
- goal\_checkin
- safeguarding
- menu
- delivery

### sources.split\_no

The number of files into which the final flow definition will be split.

Used to divide the file at the final step to get it to a manageable size that can be uploaded to RapidPro.

## special\_expiration

Used to modify expiration times.

## default\_expiration

Used to modify expiration times.

## model

Name of the Python module containing data models to use as part of the process of converting data extracted from sheets.

## languages

A list of language definitions that will be looked for to localize back into the flows. Each language definition consists of:

- `language`: 3-letter language code used in RapidPro
- `code`: 2-letter code used in CrowdIn

## translation\_repo

Location of a git repository where translations are stored.

## folder\_within\_repo

The location within `translation_repo` where translations are stored.

Used in conjunction with `translation_repo`, above.

## outputpath

Destination path for all files (including intermediary files and log files).

Default is 'output' within the current working directory.

## qr\_treatment

How to process "quick replies". Valid values are:

- move: Remove quick replies and add equivalents of them to the message text, and give numerical prompts to allow basic phone users to use the app.
- move_and_mod: As above but has additional functionality allowing you to replace phrases
- reformat: Reformat quick replies so that long ones are added to the message text, as above.
- reformat_whatsapp: Reformat quick replies to meet the whatsapp format
- reformat_china: Reformat quick replies to the standard as requested by China
- wechat: All quick replies moved to links in message text as can be used in WeChat
- none: Do nothing.

## select\_phrases

The default phrase we want to add if quick replies are being moved to message text.

## add\_selectors

If `qr_treatment` is 'move', add some basic numerical quick replies back in. Valid values are 'yes' or 'no'.

## special\_words

Path to a file containing words we always want to keep as full quick replies.

## count\_threshold

When `qr_treatment` is 'reformat', set limits on the number of quick replies that are processed.

If the number of quick replies is below or equal to count\_threshold then the quick replies are left in place.

## length\_threshold

When `qr_treatment` is 'reformat', set limits on the number of quick replies that are processed.

If the character-length of the longest quick reply is below or equal to length\_threshold then the quick replies are left in place.

## ab\_testing\_sheet\_id

Google Sheets ID for Sheet containing AB testing data.

## localisation\_sheet\_id

Google Sheets ID.

## eng\_edits\_sheet\_id

Google Sheets ID for Sheet containing dict edits data.

## transl\_edits\_sheet\_id

Google Sheets ID.

## sg\_flow\_id

Sheets ID for Sheet containing safeguarding data.

## sg\_flow\_name

The name of the RapidPro flow for safeguarding.

## sg\_path

Path to file containing translated safeguarding words in JSON format.

## sg\_sources

Defines a list of sources containing safeguarding keywords. Each entry is a `dict` containing the following keys:

- `key`: three letter language code of the translated words
- `path`: file path on the local file system to the XLSX file containing the words

For example:
```python
{
"sg_sources": [
{
"key": "spa",
"path": "excel_files/safeguarding mexico.xlsx",
},
],
}
```

The referenced XLSX files will be converted to a single file called _safeguarding\_words.json_, in the output directory. The `sg_path` setting will be overridden to point to this JSON file, for further processing. If `sg_sources` is not set, `sg_path` will remain unchanged.

## redirect\_flow\_names

Names of redirect flows to be modified as part of safeguarding process.
[sources]: sources.md
[steps]: steps.md
30 changes: 30 additions & 0 deletions docs/operations.md
@@ -0,0 +1,30 @@
# Overview

The pipeline tool supports different operations. To run the pipeline performing a sequence of operations:

```
python -m parenttext_pipeline.cli operation1 operation2 ...
```

In order to run a pipeline, you must have a configuration file, see [configuration page][config] for more details.

Two operations are currently available:

## `pull_data`

Read data from various sources (which are defined in the config) and store it locally in json format.
The data will be written to the input folder specified in the config.
Different input formats are supported, and the data for each source is written to its own subfolder; see [sources].

The purpose of this is to ensure that `compile_flows` runs of the pipeline are reproducible, by essentially freezing the state of all input spreadsheets at a point in time. It attempts to avoid the potential problem of Google Sheets being updated incorrectly and causing a pipeline run to fail. The `compile_flows` pipeline will only read locally stored data that has been pulled beforehand.
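
For example, assuming a valid `config.py` in the working directory, a typical workflow is to pull the data once and then compile flows from that frozen local copy, possibly several times:

```
python -m parenttext_pipeline.cli pull_data
python -m parenttext_pipeline.cli compile_flows
```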


## `compile_flows`

Compile RapidPro flows from locally stored json files that have been pulled using `pull_data`.
Compiling flows involves multiple processing steps that are defined in the config, see [steps].


[config]: configuration.md
[steps]: steps.md
[sources]: sources.md
38 changes: 38 additions & 0 deletions docs/sources.md
@@ -0,0 +1,38 @@
# Sources

Sources represent references to input data that may be used by [steps] of the pipeline, in various possible *source formats*.

- `sheets`: Model-agnostic spreadsheet workbooks (a *spreadsheet* or *workbook* is a collection of individual *sheets*).
- These may be in any of the following *subformats*:
- `google_sheets`: Reference to a Google spreadsheet
- `xlsx`: Reference to an XLSX file
- `csv`: Reference to a folder of csv files representing the workbook.
- `json`: Reference to a workbook in JSON format.
- Each input file is converted into JSON workbook format; the resulting files are stored flat in the output folder. In case of a name clash, a later file will overwrite an earlier file. (Processing order: `files_list` first, then `files_dict`.)
- `json`: JSON files.
- These are taken as is and copied to their new storage location.
- Currently, only local file paths are supported.
- `translation_repo`: a format specifically for the translation step, see `TranslationSourceConfig` in [configs].
- `safeguarding`: a format specifically for the safeguarding step (to be deprecated), see `SafeguardingSourceConfig` in [configs].
- Remark: In the future, we may introduce a model-specific spreadsheet format with a master sheet indicating the model underlying each sheet, so that the data can be validated and stored in a json format representing the (possibly nested) model.

Such data can be *pulled*, converting it into a github-friendly *storage format* (i.e. plaintext json) and storing it locally. Once stored locally, such data can be used as input to individual steps of the *flow compilation* pipeline. The storage format is (so far) always json, and the exact structure of the json is domain specific, i.e. the user has to make sure that the data presented is in a format suitable for a specific pipeline step. In particular, it may be possible to represent input data in different *source formats* that yield the same data in the *storage format*.

## File referencing

The source config fully determines the storage location of the data in its *storage format*. All data is stored inside of `{config.inputpath}`. For each source, a subfolder `{source.id}` is created.

### `json` and `sheets`

Within the source's subfolder, for each `(name, filepath)` entry in `{source.files_dict}`, the processed version of `{filepath}` is stored as `{name}.json`.

### `sheets` only

For the input format `sheets`, we can additionally use `files_list`; an example illustrating the resulting storage locations follows after the list below.

- A special case here is if `files_archive` is provided and `source.subformat` is `csv`, then for each `sheet_id` entry in `source.files_list`, we process the folder `sheet_id` as a csv workbook and store the converted result as `{sheet_id}.json`.
- Otherwise, for each `sheet_id` entry in `source.files_list`, the processed version of `sheet_id` is stored as `{sheet_id}.json`. Note that this currently only works if `source.subformat` is `google_sheets`, because we have not made a decision on how to turn full file paths into filenames.
- Remark: Do we still need `files_archive` (`.zip` archive) support? I'd be keen to deprecate it.
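
As an illustration of the referencing rules above, consider a hypothetical `sheets` source registered in the config under the id `flow_sheets`, with `inputpath` set to `input`. The field names in this sketch are assumptions; see the source config classes in [configs] for the authoritative definitions.

```python
# Illustrative sketch only -- ids, sheet ids and field names are placeholders.
flow_sheets_source = {
    "format": "sheets",                  # source format (assumed field name)
    "subformat": "google_sheets",
    "files_dict": {
        "edits": "<google-sheet-id-1>",  # pulled to input/flow_sheets/edits.json
    },
    "files_list": [
        "<google-sheet-id-2>",           # pulled to input/flow_sheets/<google-sheet-id-2>.json
    ],
}
```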

[configs]: ../src/parenttext_pipeline/configs.py
[steps]: steps.md
54 changes: 54 additions & 0 deletions docs/steps.md
@@ -0,0 +1,54 @@
# Pipeline steps

Each step has an identifier (name), a type, and (depending on the type) may have a list of [sources] referencing input files relevant for the step.
Depending on the type, the config of each step may have various additional fields; see [configs] for details. The identifier (name) has no further purpose: it only serves for reporting and affects temp file names. Input file locations are determined by the sources.

We have the following types of steps; an illustrative example configuration is sketched below.

- `create_flows`: Create flows from sheets (using `rpft create_flows`)
- source(s): type `sheets`, the input sheets to create the flows from
- `models_module`: Name of the Python module containing data models to use as part of the process of converting data extracted from sheets.
- `tags`: Tags to pass to `rpft create_flows`. Used to identify flows to be processed. Possible values for tag 1:
- `onboarding`
- `dev_assess`
- `ltp_activity`
- `home_activity_checkin`
- `module`
- `goal_checkin`
- `safeguarding`
- `menu`
- `delivery`
- `load_flows`: Load flows directly from json.
- source(s): type `json`, the source must reference exactly one input RapidPro json file (that the following steps operate on)
- `edits`: Apply edits and/or A/B-Testing to input flows (using repo `rapidpro_abtesting`)
- source(s): type `sheets`, the sheets defining the edits to do on the flows
- `extract_texts_for_translators`: Extract text from flows and produce a `.pot` file for translation.
- `crowdin_name`: base name of the output files
- `fix_arg_qr_translation`: ???
- `has_any_word_check`: ???
- `overall_integrity_check`: ???
- `qr_treatment`: ...
- source: type `json`, the source's `files_dict` must have an entry `select_phrases_file` and `special_words_file`
- see `QRTreatmentStepConfig` in [configs]
- `safeguarding`: ...
- source(s): type `safeguarding`, files to read safeguarding data from
- see `SafeguardingStepConfig` in [configs]
- `translation`: Generate translated flows
- source(s): type `translation_repo`, repo to read translated strings from
- `languages`: List of languages (a `list[dict]`) to translate the flows into. Each language is a dict with two keys:
- `language` is the 3-letter code used in RapidPro
- `code` is the 2-letter code used in CrowdIn
- `update_expiration_times`: Update expiration times of flows (using a default value and an optional file defining flow-specific values)
- source (optional): type `json`, the source's `files_dict` must have an entry `special_expiration_file` defining a map from flow names to expiration times
- `default_expiration_time`: expiration time to apply to all flows that are not referenced in `special_expiration_file`

The first step of the pipeline must be `create_flows` or `load_flows`. These two steps do not operate on the output of a previous step, and thus they only make sense as a first step.
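
As an illustration, a `steps` list for a small pipeline might look as follows. Identifiers, source ids and values are placeholders, and any field name not documented above is an assumption; see [configs] for the authoritative step config classes.

```python
# Illustrative sketch only -- identifiers, source ids and values are placeholders.
steps = [
    {
        "id": "create_flows",          # identifier: reporting/temp file names only (assumed field name)
        "type": "create_flows",
        "sources": ["flow_sheets"],    # a "sheets" source defined in the config
        "models_module": "models.parenttext_models",  # hypothetical module name
        "tags": ["onboarding"],        # tags passed to rpft create_flows (exact structure per rpft)
    },
    {
        "id": "expiration",
        "type": "update_expiration_times",
        "default_expiration_time": 10080,  # hypothetical value
    },
]
```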

## Remarks

We want to have the functionality to pull Goals API data from a spreadsheet and store it locally, so it can be read by the API directly from github.
This does not require a step, but can be implemented by only specifying a `goals_api` source which is not referenced by any step.

[configs]: ../src/parenttext_pipeline/configs.py
[sources]: sources.md
16 changes: 9 additions & 7 deletions pyproject.toml
@@ -1,15 +1,15 @@
[build-system]
requires = ["setuptools"]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"

[project]
name = "parenttext_pipeline"
version = "0.2.2"
dynamic = ["version"]
authors = [
{name = "IDEMS International", email = "[email protected]"},
]
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.10"
license = {text = "GPL-3.0-or-later"}
classifiers = [
"Development Status :: 4 - Beta",
@@ -20,19 +20,21 @@ classifiers = [
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Topic :: Text Processing :: General",
"Topic :: Utilities",
]
dependencies = [
"beautifulsoup4~=4.12",
"rapidpro-abtesting@https://github.com/IDEMSInternational/rapidpro_abtesting/archive/refs/tags/0.1.1.tar.gz",
"packaging~=21.3",
"rapidpro-abtesting@https://github.com/IDEMSInternational/rapidpro_abtesting/archive/refs/tags/0.1.2.tar.gz",
"requests~=2.31",
"rpft@https://github.com/IDEMSInternational/rapidpro-flow-toolkit/archive/refs/tags/1.1.3.tar.gz",
"rpft@https://github.com/IDEMSInternational/rapidpro-flow-toolkit/archive/refs/tags/1.2.1.tar.gz",
]

[project.scripts]
rpimport = "parenttext_pipeline.importer:cli"

[tool.setuptools_scm]
version_file = "src/parenttext_pipeline/_version.py"
