From 0586c344636e7c2e81ca3218353aa1a962c98aa0 Mon Sep 17 00:00:00 2001 From: Georg Osang Date: Sat, 2 Nov 2024 13:59:24 +0100 Subject: [PATCH 1/2] Refactor local source references --- src/parenttext_pipeline/configs.py | 3 ++ src/parenttext_pipeline/pull_data.py | 41 ++++++++++++++++------------ 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/parenttext_pipeline/configs.py b/src/parenttext_pipeline/configs.py index a764f3e..4c89105 100644 --- a/src/parenttext_pipeline/configs.py +++ b/src/parenttext_pipeline/configs.py @@ -143,6 +143,9 @@ class SheetsSourceConfig(SourceConfig): # Path or URL to a zip archive containing folders # each with sheets in CSV format (no nesting) files_archive: str = None + # Path relative to which other paths in the files_list/dict are, + # assuming no files_archive is provided + basepath: str = None @dataclass(kw_only=True) diff --git a/src/parenttext_pipeline/pull_data.py b/src/parenttext_pipeline/pull_data.py index 54947ec..78914df 100644 --- a/src/parenttext_pipeline/pull_data.py +++ b/src/parenttext_pipeline/pull_data.py @@ -83,6 +83,14 @@ def pull_translations(config, source, source_name): ) +def get_json_from_sheet_id(source, temp_dir, sheet_id): + if source.subformat == "google_sheets": + return convert_to_json(sheet_id, source.subformat) + else: + sheet_path = os.path.join(temp_dir, sheet_id) + return convert_to_json(sheet_path, source.subformat) + + def pull_sheets(config, source, source_name): # Download all sheets used for flow creation and edits and store as json source_input_path = get_input_subfolder( @@ -91,27 +99,26 @@ def pull_sheets(config, source, source_name): jsons = {} if source.files_archive is not None: - if source.subformat != "csv": - raise NotImplementedError( - "files_archive only supported for sheets of subformat csv." + if source.subformat == "google_sheets": + raise ValueError( + "files_archive not supported for sheets of subformat google_sheets." 
) location = source.archive archive_filepath = download_archive(config.temppath, location) - with tempfile.TemporaryDirectory() as temp_dir: - shutil.unpack_archive(archive_filepath, temp_dir) - for sheet_id in source.files_list: - csv_folder = os.path.join(temp_dir, sheet_id) - jsons[sheet_id] = convert_to_json([csv_folder], source.subformat) + temp_dir = tempfile.mkdtemp() + shutil.unpack_archive(archive_filepath, temp_dir) else: - for sheet_name in source.files_list: - if source.subformat != "google_sheets": - raise NotImplementedError( - "files_list only supported for sheets of subformat google_sheets." - ) - sheet_id = get_sheet_id(config, sheet_name) - jsons[sheet_name] = convert_to_json(sheet_id, source.subformat) - for new_name, sheet_id in source.files_dict.items(): - jsons[new_name] = convert_to_json(sheet_id, source.subformat) + temp_dir = Path(source.basepath or ".") + + for sheet_name in source.files_list: + sheet_id = get_sheet_id(config, sheet_name) + jsons[sheet_name] = get_json_from_sheet_id(source, temp_dir, sheet_id) + for new_name, sheet_name in source.files_dict.items(): + sheet_id = get_sheet_id(config, sheet_name) + jsons[new_name] = get_json_from_sheet_id(source, temp_dir, sheet_id) + + if source.files_archive is not None: + shutil.rmtree(temp_dir) for sheet_name, content in jsons.items(): with open(source_input_path / f"{sheet_name}.json", "w", encoding='utf-8') as export: From 0f9c6d80ed4597074d9e2b06b6565bc745a7d2d5 Mon Sep 17 00:00:00 2001 From: Georg Osang Date: Sat, 2 Nov 2024 14:48:21 +0100 Subject: [PATCH 2/2] Update documentation --- docs/sources.md | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/docs/sources.md b/docs/sources.md index 2d94c8f..baead09 100644 --- a/docs/sources.md +++ b/docs/sources.md @@ -53,13 +53,13 @@ Example dictionary of files: } ``` -Sources may have both. +Sources may have both. 
In this case, dict entries will overwrite list entries of the same name. ## Local storage locations The source config fully determines the storage location of the data in its *storage format*. All data is stored inside of `{config.inputpath}`. When *pulling data*, each source gets its own local subfolder: For each source, a subfolder `{source.id}` is created. The list entries (str) and dict keys determine the filenames of the locally stored files. -Remark: For Google sheets, the sheet_ids are non-descript. Thus the [configuration] has an (optional) global field `sheet_names` in which a mapping from names to sheet_ids can be provided. When a source references an input file, it first looks up whether it's in the `sheet_names` map and in that case uses the respective values. +Remark: The [configuration] has an (optional) global field `sheet_names` in which a mapping from names to sheet_ids can be provided. When a source references an input file, it first looks up whether it's in the `sheet_names` map and in that case uses the respective key as the storage file path (while pulling the file in accordance with the `sheet_names` dict value). This is useful for Google sheets because their sheet_ids are non-descript, but potentially also for local file references to abbreviate them and avoid `/`. ### `json` and `sheets` @@ -70,10 +70,84 @@ Within the source's subfolder, for each `(name, filepath)` entry in `{source.fil For the input format `sheets`, we can additionally use `files_list`. -- A special case here is if `files_archive` is provided and `source.subformat` is `csv`, then for each `sheet_id` entry in `source.files_list`, we process the folder `sheet_id` as a csv workbook and store the converted result as `{sheet_id}.json`. -- Otherwise, for each `sheet_id` entry in `source.files_list`, the processed version of `sheet_id` is stored as `{sheet_id}.json`. 
Note that this currently only works if `source.subformat` is `google_sheets`, because we have not made a decision on how to turn full file paths into filenames. +- For each `sheet_name` entry in `source.files_list`, the processed version of `sheet_name` is stored as `{sheet_name}.json`. Note that the `sheet_name` may not contain certain special characters such as `/`. +- If the subformat is not `google_sheets`, i.e. we're referencing local files, the local file path is relative to the current working directory of the pipeline. +- It is possible to provide a `basepath` (relative or absolute) to the source config; then all file paths are relative to the `basepath`. +- It is also possible to provide a `files_archive` URL to a zip file. In that case, all file paths are relative to the archive root. + - Remark: Do we still need `files_archive` (`.zip` archive) support? I'd be keen to deprecate it. +Example: Assume that, relative to the current working directory, we have a folder `csv/safeguarding` containing `.csv` files, and we have a file `excel_files/safeguarding crisis.xlsx`. Then the following stores three copies of the `csv` data and three copies of the `xlsx` data, each in json format. 
+ +``` +{ + "meta": { + "version": "1.0.0", + "pipeline_version": "1.0.0" + }, + "parents": {}, + "flows_outputbasename": "parenttext_all", + "output_split_number": 1, + "sheet_names" : { + "csv_safeguarding" : "csv/safeguarding", + "xlsx_safeguarding" : "excel_files/safeguarding crisis.xlsx" + }, + "sources": { + "safeguarding_csv_dict": { + "parent_sources": [], + "format": "sheets", + "subformat": "csv", + "files_dict": { + "safeguarding": "csv/safeguarding" + } + }, + "safeguarding_csv_list": { + "parent_sources": [], + "format": "sheets", + "subformat": "csv", + "files_list": [ + "csv_safeguarding" + ] + }, + "safeguarding_csv_list_remap": { + "parent_sources": [], + "format": "sheets", + "subformat": "csv", + "basepath": "csv", + "files_list": [ + "safeguarding" + ] + }, + "safeguarding_xlsx_dict": { + "parent_sources": [], + "format": "sheets", + "subformat": "xlsx", + "files_dict": { + "safeguarding": "excel_files/safeguarding crisis.xlsx" + } + }, + "safeguarding_xlsx_list_remap": { + "parent_sources": [], + "format": "sheets", + "subformat": "xlsx", + "files_list": [ + "xlsx_safeguarding" + ] + }, + "safeguarding_xlsx_list": { + "parent_sources": [], + "basepath": "excel_files", + "format": "sheets", + "subformat": "xlsx", + "files_list": [ + "safeguarding crisis.xlsx" + ] + } + }, + "steps": [] +} +``` + [configs]: ../src/parenttext_pipeline/configs.py [configuration]: configuration.md [steps]: steps.md