From e5e7204038b52e70a9f08bb8d3f849fa59677b8d Mon Sep 17 00:00:00 2001
From: Mike McCann
Date: Wed, 5 Jun 2024 19:47:48 +0000
Subject: [PATCH] Add ExcludeFile and methods to read & write _exclude_list files in the SMDB directories

---
 .vscode/launch.json  |   4 +--
 smdb/scripts/load.py | 110 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 104 insertions(+), 10 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index ddab3dce..2b01b289 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -40,7 +40,7 @@
             //"args": ["-v", "1", "--skipuntil_regex", "--regex", "MappingAUVOps2018/20180725m1/ZTopo.grd$", "--limit", "2"],
             //"args": [ "-v", "2", "--clobber", "--skipuntil_regex", "--regex", "mbsystem/Data/2012/20120103_MB-System_Tutorial_OFG/ProprietaryOFGSampleData/20101127OFGSample/multibeam_hsx"],
             //"args": [ "-v", "1", "--clobber", "--noinput", "--skipuntil_regex", "--regex", "MappingAUVOps2014/Sentry_Loihi/2014-glazer/dives/sentry265/multibeam/mbari/ZTopo.grd$", "--limit", "1" ],
-            "args": [ "-v", "1", "--skipuntil_regex", "--regex", "2021/20210812m1/ZTopo.grd$", "--limit", "10" ],
+            //"args": [ "-v", "1", "--skipuntil_regex", "--regex", "2021/20210812m1/ZTopo.grd$", "--limit", "10" ],
             //"args": [ "-v", "1", "--skipuntil_regex", "--regex", "2023/20230511d1/photogrammetry/ZTopo.grd$", "--limit", "1" ],
             //"args": [ "-v", "1", "--skipuntil_regex", "--regex", "2019/20190820m1/ZTopo.grd$", "--limit", "100" ],
             //"args": [ "-v", "1", "--skipuntil_regex", "--regex", "2023/20230310m1/ZTopo.grd$", "--limit", "10" ],
@@ -83,7 +83,7 @@
             //"args": ["--compilation", "-v", "1", "--skipuntil", "2022/AxialSeamount/FiguresCaress", "--limit", "10", "--log_file", "compilation.txt"],
             //"args": ["--compilation", "-v", "1"],
             //"args": ["--spreadsheets", "-v", "1"],
-            //"args": ["-v", "1", "--last_n_days", "0.5"],
+            "args": ["-v", "1", "--last_n_days", "0.5"],
             //"args": ["-v", "1", "--spreadsheets", "--parent_dir", "2024", "--append_to_log_file"],
             //"args": ["-v", "1", "--spreadsheets"],
             //"args": ["-v", "1", "--compilation", "--last_n_days", "30"],
diff --git a/smdb/scripts/load.py b/smdb/scripts/load.py
index 130e56bb..1e6c8b59 100755
--- a/smdb/scripts/load.py
+++ b/smdb/scripts/load.py
@@ -97,7 +97,7 @@ def __init__(self):
         self._log_levels = (logging.WARN, logging.INFO, logging.DEBUG)
         self._log_strings = ("WARN", "INFO", "DEBUG")
         self.commandline = None
-        self.exclude_paths = []
+        self.exclude_paths = set()
         self.start_proc = datetime.now()
 
     def process_command_line(self):
@@ -242,12 +242,6 @@ def process_command_line(self):
             self.logger.addHandler(file_handler)
         self.logger.setLevel(self._log_levels[self.args.verbose])
 
-        if not self.exclude_paths:
-            with open(self.args.exclude) as fh:
-                for line in fh:
-                    if line.startswith("/mbari/SeafloorMapping/"):
-                        self.exclude_paths.append(line.strip())
-
         if self.args.clobber_log_file:
             self.logger.info("Saving to new local log file: %s", self.LOCAL_LOG_FILE)
         else:
@@ -1176,7 +1170,7 @@ def load_from_grds(self):
         exclude_count = 0
         if self.args.last_n_days:
             self.logger.info(
-                "Loading Missions newer than %d days", self.args.last_n_days
+                "Loading Missions newer than %f days", self.args.last_n_days
             )
         with subprocess.Popen(locate_cmd, shell=True, stdout=subprocess.PIPE) as proc:
             for count, fp in enumerate(proc.stdout, start=1):
@@ -1756,19 +1750,109 @@ def process_csv(self, xlsx_files_processed: List[str] = None):
         self.logger.info(f"Wrote {count} Missions to {csv_file}")
 
 
+class ExcludeFile(BaseLoader):
+    """Read and write the lists of paths that are excluded from loading."""
+
+    def read_config_exclude_list(self) -> None:
+        """Add paths from the file named by self.args.exclude to exclude_paths.
+
+        Only lines starting with /mbari/SeafloorMapping/ are used. Nothing
+        is read if exclude_paths already has members.
+        """
+        if not self.exclude_paths:
+            with open(self.args.exclude) as fh:
+                for line in fh:
+                    if line.startswith("/mbari/SeafloorMapping/"):
+                        self.exclude_paths.add(line.strip())
+            self.logger.info(
+                f"Read {len(self.exclude_paths)} paths to exclude from {self.args.exclude}"
+            )
+
+    def read_exclude_path_xlsxs(self) -> None:
+        """Walk the <parent_dir>/SMDB folders for files named
+        <parent_dir>_exclude_list.xlsx and load the rows of the "path"
+        column into the exclude_paths set. This is used to skip over
+        directories that we don't want to process.
+        """
+        count = 0
+        file_count = 0
+        for parent_dir in os.listdir(MBARI_DIR):
+            xlsx_file = os.path.join(
+                MBARI_DIR, parent_dir, "SMDB", f"{parent_dir}_exclude_list.xlsx"
+            )
+            if not os.path.exists(xlsx_file):
+                continue
+            try:
+                df = pd.read_excel(xlsx_file, engine="openpyxl")
+                # Empty cells are read as NaN, which must not get into the set
+                paths = df["path"].dropna()
+            except (FileNotFoundError, NotADirectoryError, PermissionError) as e:
+                self.logger.debug(f"Could not open {xlsx_file} for reading: {e}")
+                continue
+            except KeyError:
+                self.logger.warning("No 'path' column found in %s", xlsx_file)
+                continue
+            file_count += 1
+            for path in paths:
+                count += 1
+                self.exclude_paths.add(str(path).strip())
+        # Report totals; a single file name is not defined if nothing was found
+        self.logger.info(
+            f"Read {count} paths to exclude from {file_count} _exclude_list.xlsx files"
+        )
+
+    def write_exclude_path_csvs(self) -> None:
+        """Write the exclude_paths to
+        <parent_dir>/SMDB/<parent_dir>_exclude_list.csv files"""
+        # This method may only need to be run once to create the exclude files from
+        # the original exclude.list file maintained in the git repo in smdb/config/exclude.list
+
+        # Build hash of paths keyed by parent_dir
+        pd_hash = {}
+        for path in sorted(self.exclude_paths):
+            # The parent_dir is the first path component below MBARI_DIR
+            _, found, remainder = path.partition(MBARI_DIR)
+            parent_dir = remainder.lstrip("/").split("/")[0]
+            if not found or not parent_dir:
+                self.logger.warning("No parent_dir found in %s", path)
+                continue
+            pd_hash.setdefault(parent_dir, []).append(path)
+        # Write out the exclude files
+        for parent_dir, paths in pd_hash.items():
+            if not os.path.isdir(os.path.join(MBARI_DIR, parent_dir)):
+                self.logger.warning("No directory found for %s", parent_dir)
+                # Skip it, otherwise the open() below fails
+                continue
+            csv_file = os.path.join(
+                MBARI_DIR, parent_dir, "SMDB", f"{parent_dir}_exclude_list.csv"
+            )
+            try:
+                with open(csv_file, "w") as fh:
+                    fh.write("path\n")
+                    fh.writelines(f"{path}\n" for path in paths)
+            except OSError as e:
+                # For example the SMDB directory is missing or not writable
+                self.logger.warning("Could not write %s: %s", csv_file, e)
+                continue
+            self.logger.info(f"Wrote {len(paths)} paths to {csv_file}")
+
+
 def run(*args):
     # Possible use: https://django-extensions.readthedocs.io/en/latest/runscript.html
     bl = BaseLoader()
     bl.process_command_line()
     bl.logger.debug("Arguments passed to run(): %s", " ".join(args))
     if bl.args.bootstrap and bl.args.notes and bl.args.fnv:
+        exclude_file_load()
         missions_saved = bootstrap_load()
         notes_load(missions_saved)
         fnv_load(missions_saved)
     elif bl.args.bootstrap and bl.args.notes:
+        exclude_file_load()
         missions_saved = bootstrap_load()
         notes_load(missions_saved)
     elif bl.args.bootstrap:
+        exclude_file_load()
         missions_saved = bootstrap_load()
     elif bl.args.notes:
         notes_load(missions_saved)
@@ -1781,6 +1865,7 @@ def run(*args):
     elif bl.args.spreadsheets:
         spreadsheets_load()
     else:
+        exclude_file_load()
         missions_saved = bootstrap_load()
         notes_load(missions_saved)
         fnv_load(missions_saved)
@@ -1790,6 +1875,15 @@ def run(*args):
     bl.save_logger_output()
 
 
+def exclude_file_load():
+    """Read the exclude lists and write them to the per parent_dir .csv files"""
+    ef = ExcludeFile()
+    ef.process_command_line()
+    ef.read_config_exclude_list()
+    ef.read_exclude_path_xlsxs()
+    ef.write_exclude_path_csvs()
+
+
 def bootstrap_load() -> list:
     bs = BootStrapper()
     bs.process_command_line()