From 0586c344636e7c2e81ca3218353aa1a962c98aa0 Mon Sep 17 00:00:00 2001 From: Georg Osang Date: Sat, 2 Nov 2024 13:59:24 +0100 Subject: [PATCH 1/2] Refactor local source references --- src/parenttext_pipeline/configs.py | 3 ++ src/parenttext_pipeline/pull_data.py | 41 ++++++++++++++++------------ 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/parenttext_pipeline/configs.py b/src/parenttext_pipeline/configs.py index a764f3e..4c89105 100644 --- a/src/parenttext_pipeline/configs.py +++ b/src/parenttext_pipeline/configs.py @@ -143,6 +143,9 @@ class SheetsSourceConfig(SourceConfig): # Path or URL to a zip archive containing folders # each with sheets in CSV format (no nesting) files_archive: str = None + # Path relative to which other paths in the files_list/dict are, + # assuming no files_archive is provided + basepath: str = None @dataclass(kw_only=True) diff --git a/src/parenttext_pipeline/pull_data.py b/src/parenttext_pipeline/pull_data.py index 54947ec..78914df 100644 --- a/src/parenttext_pipeline/pull_data.py +++ b/src/parenttext_pipeline/pull_data.py @@ -83,6 +83,14 @@ def pull_translations(config, source, source_name): ) +def get_json_from_sheet_id(source, temp_dir, sheet_id): + if source.subformat == "google_sheets": + return convert_to_json(sheet_id, source.subformat) + else: + sheet_path = os.path.join(temp_dir, sheet_id) + return convert_to_json(sheet_path, source.subformat) + + def pull_sheets(config, source, source_name): # Download all sheets used for flow creation and edits and store as json source_input_path = get_input_subfolder( @@ -91,27 +99,26 @@ def pull_sheets(config, source, source_name): jsons = {} if source.files_archive is not None: - if source.subformat != "csv": - raise NotImplementedError( - "files_archive only supported for sheets of subformat csv." + if source.subformat == "google_sheets": + raise ValueError( + "files_archive not supported for sheets of subformat google_sheets." 
) location = source.archive archive_filepath = download_archive(config.temppath, location) - with tempfile.TemporaryDirectory() as temp_dir: - shutil.unpack_archive(archive_filepath, temp_dir) - for sheet_id in source.files_list: - csv_folder = os.path.join(temp_dir, sheet_id) - jsons[sheet_id] = convert_to_json([csv_folder], source.subformat) + temp_dir = tempfile.mkdtemp() + shutil.unpack_archive(archive_filepath, temp_dir) else: - for sheet_name in source.files_list: - if source.subformat != "google_sheets": - raise NotImplementedError( - "files_list only supported for sheets of subformat google_sheets." - ) - sheet_id = get_sheet_id(config, sheet_name) - jsons[sheet_name] = convert_to_json(sheet_id, source.subformat) - for new_name, sheet_id in source.files_dict.items(): - jsons[new_name] = convert_to_json(sheet_id, source.subformat) + temp_dir = Path(source.basepath or ".") + + for sheet_name in source.files_list: + sheet_id = get_sheet_id(config, sheet_name) + jsons[sheet_name] = get_json_from_sheet_id(source, temp_dir, sheet_id) + for new_name, sheet_name in source.files_dict.items(): + sheet_id = get_sheet_id(config, sheet_name) + jsons[new_name] = get_json_from_sheet_id(source, temp_dir, sheet_id) + + if source.files_archive is not None: + shutil.rmtree(temp_dir) for sheet_name, content in jsons.items(): with open(source_input_path / f"{sheet_name}.json", "w", encoding='utf-8') as export: From 0f9c6d80ed4597074d9e2b06b6565bc745a7d2d5 Mon Sep 17 00:00:00 2001 From: Georg Osang Date: Sat, 2 Nov 2024 14:48:21 +0100 Subject: [PATCH 2/2] Update documentation --- docs/sources.md | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/docs/sources.md b/docs/sources.md index 2d94c8f..baead09 100644 --- a/docs/sources.md +++ b/docs/sources.md @@ -53,13 +53,13 @@ Example dictionary of files: } ``` -Sources may have both. +Sources may have both. 
In this case, dict entries will overwrite list entries of the same name. ## Local storage locations The source config fully determines the storage location of the data in its *storage format*. All data is stored inside of `{config.inputpath}`. When *pulling data*, each source gets its own local subfolder: For each source, a subfolder `{source.id}` is created. The list entries (str) and dict keys determine the filenames of the locally stored files. -Remark: For Google sheets, the sheet_ids are non-descript. Thus the [configuration] has an (optional) global field `sheet_names` in which a mapping from names to sheet_ids can be provided. When a source references an input file, it first looks up whether it's in the `sheet_names` map and in that case uses the respective values. +Remark: The [configuration] has an (optional) global field `sheet_names` in which a mapping from names to sheet_ids can be provided. When a source references an input file, it first looks up whether it's in the `sheet_names` map and in that case uses the respective key as the storage file path (while pulling the file in accordance with the `sheet_names` dict value). This is useful for Google sheets because their sheet_ids are non-descript, but potentially also for local file references to abbreviate them and avoid `/`. ### `json` and `sheets` @@ -70,10 +70,84 @@ Within the source's subfolder, for each `(name, filepath)` entry in `{source.fil For the input format `sheets`, we can additionally use `files_list`. -- A special case here is if `files_archive` is provided and `source.subformat` is `csv`, then for each `sheet_id` entry in `source.files_list`, we process the folder `sheet_id` as a csv workbook and store the converted result as `{sheet_id}.json`. -- Otherwise, for each `sheet_id` entry in `source.files_list`, the processed version of `sheet_id` is stored as `{sheet_id}.json`. 
Note that this currently only works if `source.subformat` is `google_sheets`, because we have not made a decision on how to turn full file paths into filenames. +- For each `sheet_name` entry in `source.files_list`, the processed version of `sheet_name` is stored as `{sheet_name}.json`. Note that the `sheet_name` may not contain certain special characters such as `/`. +- If the subformat is not `google_sheets`, i.e. we're referencing local files, the local file path is relative to the current working directory of the pipeline. +- It is possible to provide a `basepath` (relative or absolute) to the source config; then all file paths are relative to the `basepath`. +- It is also possible to provide a `files_archive` URL to a zip file. In that case, all file paths are relative to the archive root. + - Remark: Do we still need `files_archive` (`.zip` archive) support? I'd be keen to deprecate it. +Example: Assume that, relative to the current working directory, we have a folder `csv/safeguarding` containing `.csv` files, and we have a file `excel_files/safeguarding crisis.xlsx`. Then the following stores three copies of the `csv` data and three copies of the `xlsx` data, each in json format. 
+ +``` +{ + "meta": { + "version": "1.0.0", + "pipeline_version": "1.0.0" + }, + "parents": {}, + "flows_outputbasename": "parenttext_all", + "output_split_number": 1, + "sheet_names" : { + "csv_safeguarding" : "csv/safeguarding", + "xlsx_safeguarding" : "excel_files/safeguarding crisis.xlsx" + }, + "sources": { + "safeguarding_csv_dict": { + "parent_sources": [], + "format": "sheets", + "subformat": "csv", + "files_dict": { + "safeguarding": "csv/safeguarding" + } + }, + "safeguarding_csv_list": { + "parent_sources": [], + "format": "sheets", + "subformat": "csv", + "files_list": [ + "csv_safeguarding" + ] + }, + "safeguarding_csv_list_remap": { + "parent_sources": [], + "format": "sheets", + "subformat": "csv", + "basepath": "csv", + "files_list": [ + "safeguarding" + ] + }, + "safeguarding_xlsx_dict": { + "parent_sources": [], + "format": "sheets", + "subformat": "xlsx", + "files_dict": { + "safeguarding": "excel_files/safeguarding crisis.xlsx" + } + }, + "safeguarding_xlsx_list_remap": { + "parent_sources": [], + "format": "sheets", + "subformat": "xlsx", + "files_list": [ + "xlsx_safeguarding" + ] + }, + "safeguarding_xlsx_list": { + "parent_sources": [], + "basepath": "excel_files", + "format": "sheets", + "subformat": "xlsx", + "files_list": [ + "safeguarding crisis.xlsx" + ] + } + }, + "steps": [] +} +``` + [configs]: ../src/parenttext_pipeline/configs.py [configuration]: configuration.md [steps]: steps.md