Commit 637680a

Merge pull request #29 from sb-ncbr/version_rsync_logs

Version rsync logs

hojkas authored Jun 1, 2024 · 2 parents 20786d2 + 0904eda
Showing 7 changed files with 92 additions and 17 deletions.
23 changes: 22 additions & 1 deletion src/config.py
@@ -4,7 +4,7 @@
from dataclasses import dataclass, field
from typing import Optional

from src.utils import get_formatted_date, int_from_env, bool_from_env, int_list_from_env
from src.utils import get_formatted_date, int_from_env, bool_from_env, int_list_from_env, get_formatted_timestamp


@dataclass(slots=True)
@@ -143,6 +143,7 @@ class FilepathConfig:
# logs
_full_log_name: str = env.get("FULL_LOG_NAME", "full_log.txt")
_filtered_log_name: str = env.get("FILTERED_LOG_NAME", "filtered_log.txt")
_run_start_timestamp: str = get_formatted_timestamp()

@property
def rest_jsons(self) -> str:
@@ -212,6 +213,14 @@ def key_trends_versions_json(self) -> str:
def old_crunched_csv(self) -> str:
return path.join(self.output_root_path, self._old_crunched_csv_name)

@property
def archive_rsync_logs(self) -> str:
return path.join(self.logs_root_path, "rsync_log_history")

@property
def archive_app_logs(self) -> str:
return path.join(self.logs_root_path, "full_log_history")

@property
def full_log(self) -> str:
return path.join(self.logs_root_path, self._full_log_name)
@@ -220,6 +229,18 @@ def full_log(self) -> str:
def filtered_log(self) -> str:
return path.join(self.logs_root_path, self._filtered_log_name)

@property
def archive_full_log(self) -> str:
return path.join(self.archive_app_logs, f"{self._run_start_timestamp}_{self._full_log_name}")

@property
def mmcif_rsync_log(self) -> str:
return path.join(self.archive_rsync_logs, f"{self._run_start_timestamp}_mmcif_rsync_log.txt")

@property
def xml_rsync_log(self) -> str:
return path.join(self.archive_rsync_logs, f"{self._run_start_timestamp}_xml_rsync_log.txt")


@dataclass(slots=True)
class DownloadTimeoutConfig:
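Worth noting in the config diff above: the timestamp default is evaluated once, when the dataclass body is defined, so every archive path built during one run shares the same per-run prefix. A minimal standalone sketch of the pattern (the class and path names here are illustrative, not the app's real config):

from datetime import datetime
from os import path


def get_formatted_timestamp() -> str:
    return datetime.now().strftime("%Y%m%d_%H%M%S")


class FilepathSketch:
    # Evaluated once when the class body runs; every path built from it
    # shares one per-run prefix.
    _run_start_timestamp: str = get_formatted_timestamp()
    logs_root_path: str = "logs"

    @property
    def mmcif_rsync_log(self) -> str:
        # e.g. logs/rsync_log_history/20240601_143052_mmcif_rsync_log.txt
        return path.join(self.logs_root_path, "rsync_log_history",
                         f"{self._run_start_timestamp}_mmcif_rsync_log.txt")


print(FilepathSketch().mmcif_rsync_log)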
2 changes: 2 additions & 0 deletions src/data_download/data_download_manager.py
@@ -54,6 +54,7 @@ def sync_pdbe_mmcif_via_rsync(config: Config) -> Optional[ChangedIds]:
RsyncDataType.ARCHIVE_MMCIF,
config.filepaths.gz_pdb_mmcifs,
config.filepaths.pdb_mmcifs,
config.filepaths.mmcif_rsync_log
)
changed_ids = ChangedIds(
updated=rsync_log.get_successful_recieved_ids(),
@@ -198,6 +199,7 @@ def rsync_xml_validation_files(config: Config) -> list[str]:
RsyncDataType.XML_VALIDATION_REPORTS,
config.filepaths.gz_xml_reports,
config.filepaths.xml_reports,
config.filepaths.xml_rsync_log
)
logging.info("Rsync of validation xml files finished successfully.")
return rsync_log.get_successful_recieved_ids()
43 changes: 30 additions & 13 deletions src/data_download/rsync_handler.py
@@ -7,7 +7,8 @@
from enum import Enum
from typing import Optional

from src.exception import DataDownloadError
from src.exception import DataDownloadError, ParsingError
from src.generic_file_handlers.plain_file_handler import load_file


@dataclass(slots=True)
@@ -64,21 +65,36 @@ class RsyncDataType(Enum):
XML_VALIDATION_REPORTS = 1


def rsync_and_unzip(rsync_data_type: RsyncDataType, gzip_folder: str, unpacked_folder: str) -> RsyncLog:
def rsync_and_unzip(
rsync_data_type: RsyncDataType, gzip_folder: str, unpacked_folder: str, log_location: str
) -> RsyncLog:
"""
Assembles the rsync command based on the rsync data type and runs it. Parses the rsync log to get received
and deleted file entries. Unzips the received .gz files and removes deleted files from the unpacked folder.
:param rsync_data_type: Type of data to rsync. Rsync command is assembled based on it, and logs are parsed
differently for each type (based on file extensions associated with the type).
:param gzip_folder: Path to folder where .gz versions are stored. This is the rsync target.
:param unpacked_folder: Folder where unpacked files are stored.
:param log_location: Path of the log to be created and used.
:return: Parsed rsync log.
"""
rsync_command = _assemble_rsync_command(rsync_data_type, gzip_folder)
logging.info("Running rsync command: '%s'", " ".join(rsync_command))
rsync_raw_log = _run_rsync_command(rsync_command)
_run_rsync_command(rsync_command, log_location)
logging.info("Rsync of gzip files done. Next: parsing log.")

try:
rsync_raw_log = load_file(log_location)
except ParsingError as ex:
logging.critical(
"Failed to open rsync log %s. The changes to the files mentioned inside are not "
"processed. You need to see the log for recieved files, and either delete those gip files and"
"run the data download again unchanged, or unzip them yourself and then run the data download again"
"with the changed ids passed as OVERRIDE_IDS_TO_DOWNLOAD_PATH.",
log_location
)
raise DataDownloadError(f"Failed to open rsync log {log_location}. {ex}") from ex

if rsync_data_type == RsyncDataType.ARCHIVE_MMCIF:
rsync_log = _parse_rsync_log_for_mmcif(rsync_raw_log)
elif rsync_data_type == RsyncDataType.XML_VALIDATION_REPORTS:
@@ -124,22 +140,23 @@ def _assemble_rsync_command(rsync_data_type: RsyncDataType, target_folder_path:
raise DataDownloadError(f"Unsupported rsync type {rsync_data_type}.")

command.append(target_folder_path)

return command


def _run_rsync_command(rsync_command: list[str]) -> str:
def _run_rsync_command(rsync_command: list[str], log_location: str) -> None:
try:
completed_process = subprocess.run(rsync_command, check=True, capture_output=True)
return completed_process.stdout.decode("utf8", errors="ignore")
with open(log_location, "w", encoding="utf8") as f:
subprocess.run(rsync_command, check=True, stdout=f)
except subprocess.CalledProcessError as ex:
logging.error(
"Called subproccess error:\n%s\n",
ex.stderr.decode("utf8", errors="ignore").strip(),
)
logging.error("Called subproccess error:\n%s\n", ex)
logging.critical(
"Check the following log. If any items were recieved, manual actions are needed (unzipping the recieved "
"files; and running download with overriden ids to download for these in case of mmcifs).\n%s\n",
ex.stdout.decode("utf8", errors="ignore").strip(),
"Failed to finish rsync command. See newest rsync log to see files that were already synced "
"(file %s). The changes to the files mentioned inside are not "
"processed. You need to see the log for recieved files, and either delete those gip files and"
"run the data download again unchanged, or unzip them yourself and then run the data download again"
"with the changed ids passed as OVERRIDE_IDS_TO_DOWNLOAD_PATH.",
log_location
)
raise DataDownloadError(f"Rsync failed: {ex}") from ex

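The key change in _run_rsync_command above: instead of capturing rsync output in memory and returning it, stdout now streams straight into a persistent log file, so a partial log survives even if the process dies mid-run. A self-contained sketch of that pattern (echo stands in for rsync here):

import subprocess


def run_logged(command: list[str], log_location: str) -> None:
    # check=True raises CalledProcessError on a non-zero exit code;
    # stdout is written to the file incrementally, so the log persists on failure.
    with open(log_location, "w", encoding="utf8") as f:
        subprocess.run(command, check=True, stdout=f)


run_logged(["echo", "receiving file list ... done"], "/tmp/example_rsync_log.txt")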
15 changes: 14 additions & 1 deletion src/generic_file_handlers/plain_file_handler.py
@@ -1,6 +1,6 @@
import logging

from src.exception import FileWritingError
from src.exception import FileWritingError, ParsingError


def write_file(filepath: str, content: str) -> None:
@@ -16,3 +16,16 @@ def write_file(filepath: str, content: str) -> None:
logging.info("Saved file %s", filepath)
except OSError as ex:
raise FileWritingError(f"Failed to write file {filepath}: {ex}") from ex


def load_file(filepath: str) -> str:
"""
Load content from given filepath.
:param filepath: Path to the file to load.
:return: String representation of file content.
"""
try:
with open(filepath, "r", encoding="utf8") as f:
return f.read()
except OSError as ex:
raise ParsingError(f"Failed to read file {filepath}: {ex}") from ex
10 changes: 10 additions & 0 deletions src/main.py
@@ -23,6 +23,11 @@ def prepare_log_folder(config: Config) -> None:
if os.path.exists(config.filepaths.full_log):
os.remove(config.filepaths.full_log)

if not os.path.exists(config.filepaths.archive_rsync_logs):
os.mkdir(config.filepaths.archive_rsync_logs)
if not os.path.exists(config.filepaths.archive_app_logs):
os.mkdir(config.filepaths.archive_app_logs)


def configure_logging(config: Config) -> None:
"""
@@ -38,6 +43,10 @@ def configure_logging(config: Config) -> None:
full_log_file_handler = logging.FileHandler(config.filepaths.full_log)
full_log_file_handler.setFormatter(formatter)

# logging to a timestamped file archive in full
archive_log_file_handler = logging.FileHandler(config.filepaths.archive_full_log)
archive_log_file_handler.setFormatter(formatter)

# logging to file filtering info/debug out
filtered_log_file_handler = logging.FileHandler(config.filepaths.filtered_log)
filtered_log_file_handler.setFormatter(formatter)
@@ -48,6 +57,7 @@
stream_handler.setFormatter(formatter)

root_logger.addHandler(full_log_file_handler)
root_logger.addHandler(archive_log_file_handler)
root_logger.addHandler(filtered_log_file_handler)
root_logger.addHandler(stream_handler)

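Mirroring every record into the timestamped archive takes nothing more than a second FileHandler on the root logger, which is what configure_logging above adds. A sketch of the idea (the paths and format string are illustrative; the real app takes them from config and creates the folders in prepare_log_folder):

import logging
import os

os.makedirs("logs/full_log_history", exist_ok=True)
formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# Identical records go to both the overwritable working log and the
# per-run timestamped archive.
for log_path in ("logs/full_log.txt", "logs/full_log_history/20240601_143052_full_log.txt"):
    handler = logging.FileHandler(log_path)
    handler.setFormatter(formatter)
    root_logger.addHandler(handler)

logging.info("This record lands in both files.")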
8 changes: 8 additions & 0 deletions src/utils.py
@@ -223,6 +223,14 @@ def get_formatted_date() -> str:
return datetime.now().strftime("%Y%m%d")


def get_formatted_timestamp() -> str:
"""
Get current time in the format %Y%m%d_%H%M%S.
:return: The timestamp string
"""
return datetime.now().strftime("%Y%m%d_%H%M%S")


def find_matching_files(folder_path: str, string_to_match: str) -> list[str]:
"""
Find files in given folder path that contain given string exactly.
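A convenient property of the zero-padded %Y%m%d_%H%M%S format: lexicographic order equals chronological order, so the archived logs list in run order in the history folders. A quick check:

from datetime import datetime

stamps = [datetime(2024, 6, 1, 9, 5, 0), datetime(2024, 5, 31, 23, 59, 59)]
formatted = [d.strftime("%Y%m%d_%H%M%S") for d in stamps]
# The May 31 run sorts before the June 1 run, matching chronology.
assert sorted(formatted) == ["20240531_235959", "20240601_090500"]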
8 changes: 6 additions & 2 deletions user_guide.md
@@ -1,5 +1,9 @@
# User guide

If at any point the application fails, check the `logs/` folder and its subfolders for information about
what happened, whether anything needs to be done, and how to recover. Take special care if a `CRITICAL` log level
is ever encountered, as that requires manual action to avoid losing information.

1. [Package management](#package-management)
2. [Local development](#local-development)
1. [Docker](#docker)
@@ -363,7 +367,7 @@ Data download is done as follows:
First, files in location `./dataset/gz_PDBe_mmCIF/` are synced to the rsync endpoint `rsync.rcsb.org::ftp_data/structures/divided/mmCIF/`. These files have `.cif.gz` extensions.
The output of this operation is parsed to see which files were received or removed, and what their PDB ids are.
The output of this operation is parsed to see which files were received or removed, and what their PDB ids are. The log is also saved into its log subfolder.
Based on the rsync log, received files are unzipped into plain `.cif` to folder `./dataset/PDBe_mmCIF/`. Those deleted are deleted from there as well. The list of ids that changed or were removed is saved for further processing.
@@ -413,7 +417,7 @@ In cases not mentioned here, follow the WARNING and ERROR levels of logs (stored
- **App fails during the rsync of structure mmCIF files and no other actions are done.**
1. Check the logs for the raw rsync log of the part that still managed to run. If any files were still received, action needs to be taken.
1. Check `logs/rsync_log_history/` for the raw rsync log of the part that still managed to run. If any files were still received, action needs to be taken.
2. Check that the received files were unzipped - if not, do so manually. Then write the list of PDB ids of the downloaded mmcifs into a json file and pass its path via the env variable `OVERRIDE_IDS_TO_DOWNLOAD_PATH` (see the sketch below). Then run the app again.
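A hedged sketch of that recovery step, assuming the override file is a flat json list of PDB ids and that received entries in the mmCIF rsync log end in `.cif.gz`; the log filename below is only an example of the timestamped names this PR introduces:

import json
import re

received_ids = []
with open("logs/rsync_log_history/20240601_143052_mmcif_rsync_log.txt", encoding="utf8") as f:
    for line in f:
        # PDB ids are four characters, e.g. "1abc" from "1abc.cif.gz".
        match = re.search(r"(\w{4})\.cif\.gz", line)
        if match:
            received_ids.append(match.group(1))

with open("override_ids.json", "w", encoding="utf8") as f:
    json.dump(received_ids, f)

# Then rerun the app with OVERRIDE_IDS_TO_DOWNLOAD_PATH=override_ids.json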
