Commit 637680a

Merge pull request #29 from sb-ncbr/version_rsync_logs

Version rsync logs

hojkas authored Jun 1, 2024 · 2 parents 20786d2 + 0904eda
Showing 7 changed files with 92 additions and 17 deletions.
23 changes: 22 additions & 1 deletion src/config.py
@@ -4,7 +4,7 @@
from dataclasses import dataclass, field
from typing import Optional

from src.utils import get_formatted_date, int_from_env, bool_from_env, int_list_from_env
from src.utils import get_formatted_date, int_from_env, bool_from_env, int_list_from_env, get_formatted_timestamp


@dataclass(slots=True)
@@ -143,6 +143,7 @@ class FilepathConfig:
# logs
_full_log_name: str = env.get("FULL_LOG_NAME", "full_log.txt")
_filtered_log_name: str = env.get("FILTERED_LOG_NAME", "filtered_log.txt")
_run_start_timestamp: str = get_formatted_timestamp()

@property
def rest_jsons(self) -> str:
@@ -212,6 +213,14 @@ def key_trends_versions_json(self) -> str:
def old_crunched_csv(self) -> str:
return path.join(self.output_root_path, self._old_crunched_csv_name)

@property
def archive_rsync_logs(self) -> str:
return path.join(self.logs_root_path, "rsync_log_history")

@property
def archive_app_logs(self) -> str:
return path.join(self.logs_root_path, "full_log_history")

@property
def full_log(self) -> str:
return path.join(self.logs_root_path, self._full_log_name)
@@ -220,6 +229,18 @@ def full_log(self) -> str:
def filtered_log(self) -> str:
return path.join(self.logs_root_path, self._filtered_log_name)

@property
def archive_full_log(self) -> str:
return path.join(self.archive_app_logs, f"{self._run_start_timestamp}_{self._full_log_name}")

@property
def mmcif_rsync_log(self) -> str:
return path.join(self.archive_rsync_logs, f"{self._run_start_timestamp}_mmcif_rsync_log.txt")

@property
def xml_rsync_log(self) -> str:
return path.join(self.archive_rsync_logs, f"{self._run_start_timestamp}_xml_rsync_log.txt")


@dataclass(slots=True)
class DownloadTimeoutConfig:
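Worth noting in the config diff above: the timestamp default is evaluated once, when the dataclass body is defined, so every archive path built during one run shares the same per-run prefix. A minimal standalone sketch of the pattern (the class and path names here are illustrative, not the app's real config):

from datetime import datetime
from os import path


def get_formatted_timestamp() -> str:
    return datetime.now().strftime("%Y%m%d_%H%M%S")


class FilepathSketch:
    # Evaluated once when the class body runs; every path built from it
    # shares one per-run prefix.
    _run_start_timestamp: str = get_formatted_timestamp()
    logs_root_path: str = "logs"

    @property
    def mmcif_rsync_log(self) -> str:
        # e.g. logs/rsync_log_history/20240601_143052_mmcif_rsync_log.txt
        return path.join(self.logs_root_path, "rsync_log_history",
                         f"{self._run_start_timestamp}_mmcif_rsync_log.txt")


print(FilepathSketch().mmcif_rsync_log)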
2 changes: 2 additions & 0 deletions src/data_download/data_download_manager.py
@@ -54,6 +54,7 @@ def sync_pdbe_mmcif_via_rsync(config: Config) -> Optional[ChangedIds]:
RsyncDataType.ARCHIVE_MMCIF,
config.filepaths.gz_pdb_mmcifs,
config.filepaths.pdb_mmcifs,
config.filepaths.mmcif_rsync_log
)
changed_ids = ChangedIds(
updated=rsync_log.get_successful_recieved_ids(),
@@ -198,6 +199,7 @@ def rsync_xml_validation_files(config: Config) -> list[str]:
RsyncDataType.XML_VALIDATION_REPORTS,
config.filepaths.gz_xml_reports,
config.filepaths.xml_reports,
config.filepaths.xml_rsync_log
)
logging.info("Rsync of validation xml files finished successfully.")
return rsync_log.get_successful_recieved_ids()
43 changes: 30 additions & 13 deletions src/data_download/rsync_handler.py
@@ -7,7 +7,8 @@
from enum import Enum
from typing import Optional

from src.exception import DataDownloadError
from src.exception import DataDownloadError, ParsingError
from src.generic_file_handlers.plain_file_handler import load_file


@dataclass(slots=True)
@@ -64,21 +65,36 @@ class RsyncDataType(Enum):
XML_VALIDATION_REPORTS = 1


def rsync_and_unzip(rsync_data_type: RsyncDataType, gzip_folder: str, unpacked_folder: str) -> RsyncLog:
def rsync_and_unzip(
rsync_data_type: RsyncDataType, gzip_folder: str, unpacked_folder: str, log_location: str
) -> RsyncLog:
"""
Assembles the rsync command based on the rsync data type and runs it. Parses the rsync log to get received
and deleted file entries. Unzips the received .gz files and removes deleted files from the unpacked folder.
:param rsync_data_type: Type of data to rsync. Rsync command is assembled based on it, and logs are parsed
differently for each type (based on file extensions associated with the type).
:param gzip_folder: Path to folder where .gz versions are stored. This is the rsync target.
:param unpacked_folder: Folder where unpacked files are stored.
:param log_location: Path of the log to be created and used.
:return: Parsed rsync log.
"""
rsync_command = _assemble_rsync_command(rsync_data_type, gzip_folder)
logging.info("Running rsync command: '%s'", " ".join(rsync_command))
rsync_raw_log = _run_rsync_command(rsync_command)
_run_rsync_command(rsync_command, log_location)
logging.info("Rsync of gzip files done. Next: parsing log.")

try:
rsync_raw_log = load_file(log_location)
except ParsingError as ex:
logging.critical(
"Failed to open rsync log %s. The changes to the files mentioned inside are not "
"processed. You need to see the log for recieved files, and either delete those gip files and"
"run the data download again unchanged, or unzip them yourself and then run the data download again"
"with the changed ids passed as OVERRIDE_IDS_TO_DOWNLOAD_PATH.",
log_location
)
raise DataDownloadError(f"Failed to open rsync log {log_location}. {ex}") from ex

if rsync_data_type == RsyncDataType.ARCHIVE_MMCIF:
rsync_log = _parse_rsync_log_for_mmcif(rsync_raw_log)
elif rsync_data_type == RsyncDataType.XML_VALIDATION_REPORTS:
@@ -124,22 +140,23 @@ def _assemble_rsync_command(rsync_data_type: RsyncDataType, target_folder_path:
raise DataDownloadError(f"Unsupported rsync type {rsync_data_type}.")

command.append(target_folder_path)

return command


def _run_rsync_command(rsync_command: list[str]) -> str:
def _run_rsync_command(rsync_command: list[str], log_location: str) -> None:
try:
completed_process = subprocess.run(rsync_command, check=True, capture_output=True)
return completed_process.stdout.decode("utf8", errors="ignore")
with open(log_location, "w", encoding="utf8") as f:
subprocess.run(rsync_command, check=True, stdout=f)
except subprocess.CalledProcessError as ex:
logging.error(
"Called subproccess error:\n%s\n",
ex.stderr.decode("utf8", errors="ignore").strip(),
)
logging.error("Called subproccess error:\n%s\n", ex)
logging.critical(
"Check the following log. If any items were recieved, manual actions are needed (unzipping the recieved "
"files; and running download with overriden ids to download for these in case of mmcifs).\n%s\n",
ex.stdout.decode("utf8", errors="ignore").strip(),
"Failed to finish rsync command. See newest rsync log to see files that were already synced "
"(file %s). The changes to the files mentioned inside are not "
"processed. You need to see the log for recieved files, and either delete those gip files and"
"run the data download again unchanged, or unzip them yourself and then run the data download again"
"with the changed ids passed as OVERRIDE_IDS_TO_DOWNLOAD_PATH.",
log_location
)
raise DataDownloadError(f"Rsync failed: {ex}") from ex

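The key change in _run_rsync_command above: instead of capturing rsync output in memory and returning it, stdout now streams straight into a persistent log file, so a partial log survives even if the process dies mid-run. A self-contained sketch of that pattern (echo stands in for rsync here):

import subprocess


def run_logged(command: list[str], log_location: str) -> None:
    # check=True raises CalledProcessError on a non-zero exit code;
    # stdout is written to the file incrementally, so the log persists on failure.
    with open(log_location, "w", encoding="utf8") as f:
        subprocess.run(command, check=True, stdout=f)


run_logged(["echo", "receiving file list ... done"], "/tmp/example_rsync_log.txt")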
15 changes: 14 additions & 1 deletion src/generic_file_handlers/plain_file_handler.py
@@ -1,6 +1,6 @@
import logging

from src.exception import FileWritingError
from src.exception import FileWritingError, ParsingError


def write_file(filepath: str, content: str) -> None:
@@ -16,3 +16,16 @@ def write_file(filepath: str, content: str) -> None:
logging.info("Saved file %s", filepath)
except OSError as ex:
raise FileWritingError(f"Failed to write file {filepath}: {ex}") from ex


def load_file(filepath: str) -> str:
"""
Load content from given filepath.
:param filepath: Path to the file to load.
:return: String representation of file content.
"""
try:
with open(filepath, "r", encoding="utf8") as f:
return f.read()
except OSError as ex:
raise ParsingError(f"Failed to read file {filepath}: {ex}") from ex
10 changes: 10 additions & 0 deletions src/main.py
@@ -23,6 +23,11 @@ def prepare_log_folder(config: Config) -> None:
if os.path.exists(config.filepaths.full_log):
os.remove(config.filepaths.full_log)

if not os.path.exists(config.filepaths.archive_rsync_logs):
os.mkdir(config.filepaths.archive_rsync_logs)
if not os.path.exists(config.filepaths.archive_app_logs):
os.mkdir(config.filepaths.archive_app_logs)


def configure_logging(config: Config) -> None:
"""
@@ -38,6 +43,10 @@ def configure_logging(config: Config) -> None:
full_log_file_handler = logging.FileHandler(config.filepaths.full_log)
full_log_file_handler.setFormatter(formatter)

# logging to a timestamped file archive in full
archive_log_file_handler = logging.FileHandler(config.filepaths.archive_full_log)
archive_log_file_handler.setFormatter(formatter)

# logging to file filtering info/debug out
filtered_log_file_handler = logging.FileHandler(config.filepaths.filtered_log)
filtered_log_file_handler.setFormatter(formatter)
@@ -48,6 +57,7 @@
stream_handler.setFormatter(formatter)

root_logger.addHandler(full_log_file_handler)
root_logger.addHandler(archive_log_file_handler)
root_logger.addHandler(filtered_log_file_handler)
root_logger.addHandler(stream_handler)

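Mirroring every record into the timestamped archive takes nothing more than a second FileHandler on the root logger, which is what configure_logging above adds. A sketch of the idea (the paths and format string are illustrative; the real app takes them from config and creates the folders in prepare_log_folder):

import logging
import os

os.makedirs("logs/full_log_history", exist_ok=True)
formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# Identical records go to both the overwritable working log and the
# per-run timestamped archive.
for log_path in ("logs/full_log.txt", "logs/full_log_history/20240601_143052_full_log.txt"):
    handler = logging.FileHandler(log_path)
    handler.setFormatter(formatter)
    root_logger.addHandler(handler)

logging.info("This record lands in both files.")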
8 changes: 8 additions & 0 deletions src/utils.py
@@ -223,6 +223,14 @@ def get_formatted_date() -> str:
return datetime.now().strftime("%Y%m%d")


def get_formatted_timestamp() -> str:
"""
Get current time in the format %Y%m%d_%H%M%S.
:return: The timestamp string
"""
return datetime.now().strftime("%Y%m%d_%H%M%S")


def find_matching_files(folder_path: str, string_to_match: str) -> list[str]:
"""
Find files in given folder path that contain given string exactly.
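A convenient property of the zero-padded %Y%m%d_%H%M%S format: lexicographic order equals chronological order, so the archived logs list in run order in the history folders. A quick check:

from datetime import datetime

stamps = [datetime(2024, 6, 1, 9, 5, 0), datetime(2024, 5, 31, 23, 59, 59)]
formatted = [d.strftime("%Y%m%d_%H%M%S") for d in stamps]
# The May 31 run sorts before the June 1 run, matching chronology.
assert sorted(formatted) == ["20240531_235959", "20240601_090500"]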
8 changes: 6 additions & 2 deletions user_guide.md
@@ -1,5 +1,9 @@
# User guide

If at any point the application fails, check the `logs/` folder and its subfolders for information about
what happened, whether anything needs to be done, and how to recover. Take special care if a `CRITICAL` log level
is ever encountered, as that requires manual action to avoid losing information.

1. [Package management](#package-management)
2. [Local development](#local-development)
1. [Docker](#docker)
@@ -363,7 +367,7 @@ Data download is done as follows:
First, files in location `./dataset/gz_PDBe_mmCIF/` are synced to the rsync endpoint `rsync.rcsb.org::ftp_data/structures/divided/mmCIF/`. These files have `.cif.gz` extensions.
The output of this operation is parsed to see which files were received or removed, and what their PDB ids are.
The output of this operation is parsed to see which files were received or removed, and what their PDB ids are. The log is also saved into its log subfolder.
Based on the rsync log, received files are unzipped into plain `.cif` to folder `./dataset/PDBe_mmCIF/`. Those deleted are deleted from there as well. The list of ids that changed or were removed is saved for further processing.
@@ -413,7 +417,7 @@ In cases not mentioned here, follow the WARNING and ERROR levels of logs (stored
- **App fails during the rsync of structure mmCIF files and no other actions are done.**
1. Check the logs for the raw rsync log of the part that still managed to run. If any files were still received, action needs to be taken.
1. Check `logs/rsync_log_history/` for the raw rsync log of the part that still managed to run. If any files were still received, action needs to be taken.
2. Check that the received files were unzipped - if not, do so manually. Then write the list of PDB ids of the downloaded mmcifs into a json file and pass its path via the env variable `OVERRIDE_IDS_TO_DOWNLOAD_PATH` (see the sketch below). Then run the app again.
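A hedged sketch of that recovery step, assuming the override file is a flat json list of PDB ids and that received entries in the mmCIF rsync log end in `.cif.gz`; the log filename below is only an example of the timestamped names this PR introduces:

import json
import re

received_ids = []
with open("logs/rsync_log_history/20240601_143052_mmcif_rsync_log.txt", encoding="utf8") as f:
    for line in f:
        # PDB ids are four characters, e.g. "1abc" from "1abc.cif.gz".
        match = re.search(r"(\w{4})\.cif\.gz", line)
        if match:
            received_ids.append(match.group(1))

with open("override_ids.json", "w", encoding="utf8") as f:
    json.dump(received_ids, f)

# Then rerun the app with OVERRIDE_IDS_TO_DOWNLOAD_PATH=override_ids.json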
