Plan for v0.6.1 #226

Merged: 53 commits, Jun 9, 2023
Changes from 27 commits

Commits
d30836d
chore: initial commit for v0.6.1
matinnuhamunada Jan 30, 2023
50503d0
fix: pin numpy for GTDBtk
matinnuhamunada Feb 18, 2023
8e9d84e
fix: correct BGC compare workflow
matinnuhamunada Mar 3, 2023
e0ed910
chore: correct automlst DAG
matinnuhamunada Mar 3, 2023
31bb9a5
chore: add library for relationship transfer to metabase
matinnuhamunada Mar 3, 2023
df5f590
add instruction to transfer relationship to metabase
matinnuhamunada Mar 3, 2023
769455c
fix: add nbextensions
matinnuhamunada Mar 3, 2023
56b7e6d
fix: change table name
matinnuhamunada Mar 4, 2023
47f01fa
feat: export all csv to parquet as warehouse
matinnuhamunada Mar 4, 2023
e0a8ba4
feat: upgrade metabase and database related functions
matinnuhamunada Mar 4, 2023
35233c0
test: fix test change and visibility
matinnuhamunada Mar 4, 2023
a28d3df
fix: correct bgc_id for data warehouse
matinnuhamunada Mar 9, 2023
adede14
fix: add figure statistics to roary report
matinnuhamunada Mar 9, 2023
a4c7756
fix: handle unusual sequence start location
matinnuhamunada Mar 12, 2023
d454447
feat: add multi-threading for autoMLST wrapper
matinnuhamunada Mar 13, 2023
ff4f5a7
chore: clean up obsolete rules
matinnuhamunada Mar 13, 2023
daa354b
chore: clean up refseq masher
matinnuhamunada Mar 13, 2023
7954ab9
feat: upgrade automlst patch 0.1.1
matinnuhamunada Mar 13, 2023
3fceee2
notebook: use plotly and pygraphviz for ARTS2 report
matinnuhamunada Mar 16, 2023
1d80370
fix: handle unusual locus tag location with characters and joints
matinnuhamunada Mar 19, 2023
929f469
fix: enforce correct strain id using input
matinnuhamunada Mar 19, 2023
5a1cc0b
notebook: use graphviz sfdp layout to process large network
matinnuhamunada Mar 19, 2023
a25988a
fix: correct genome_id preference in bgc overview
matinnuhamunada Mar 19, 2023
9b61703
fix: correct gtdb mjson when metadata and gtdb release are missing
matinnuhamunada Apr 17, 2023
4ce130e
feat: BGC comparison with mmseqs2 and minimap
matinnuhamunada Apr 17, 2023
5141da5
fix: use mix parameter for bigscape in BGC comparison
matinnuhamunada Apr 17, 2023
2375ddd
fix: update deeptfactor dependencies
matinnuhamunada Apr 17, 2023
3d412b2
fix: correct ARTS parameter to search for DUF and known resistance mo…
matinnuhamunada Apr 18, 2023
89000bd
fix: correct path typo in arts rules
matinnuhamunada Apr 20, 2023
f16294f
fix: correct outfile generation for arts report
matinnuhamunada Apr 22, 2023
eed6aa1
notebook: update arts, cblaster, gtdbtk, and prokka-gbk report
matinnuhamunada Apr 24, 2023
ed06fc2
feat: copy summary and cds tsvs
matinnuhamunada Apr 24, 2023
6638aa4
feat: add README in the processed folders
matinnuhamunada Apr 24, 2023
894ef88
fix: add prokka log, summary and cds table
matinnuhamunada Apr 27, 2023
2369041
fix: handle empty result for best k cluster
matinnuhamunada May 1, 2023
adf9f46
test: update gtdb CARD metadata
matinnuhamunada May 2, 2023
4efa2f0
fix: update pytorch dependencies for deepTF
matinnuhamunada May 2, 2023
c492d37
feat: add path to region.csv
matinnuhamunada May 3, 2023
09caa3d
test: update region format table output
matinnuhamunada May 3, 2023
148c87a
fix: enforce samples csv metadata as string
matinnuhamunada May 9, 2023
13d5da5
fix: enforce samples csv metadata as string
matinnuhamunada May 9, 2023
6424fe9
fix: avoid mistakes in cds region matching across contigs
matinnuhamunada May 10, 2023
6ce8386
fix: handle overlapping cds among two or more regions
matinnuhamunada May 11, 2023
0b3d2e5
feat: update bgc subworkflow rules
matinnuhamunada May 15, 2023
822c47a
fix: correct input preparation for bgc subworkflow
matinnuhamunada May 15, 2023
0f8b4f8
feat: include antismash region genbank name change in result
matinnuhamunada May 15, 2023
82e41fc
fix: handle missing gtdb entry release
matinnuhamunada May 18, 2023
465dcac
fix: add details in metadata for selecting genomes in gtdbtk run
matinnuhamunada May 18, 2023
e43bb6a
tests: update gtdb prep
matinnuhamunada May 18, 2023
50f833c
chore: correct steps
matinnuhamunada May 18, 2023
131f324
fix: correct GTDB version naming
matinnuhamunada May 22, 2023
0311f54
fix: handle missing strains column
matinnuhamunada May 24, 2023
3fd18df
fix: undo make temp files for prokka
matinnuhamunada May 25, 2023
2 changes: 2 additions & 0 deletions .github/workflows/push.yml
@@ -57,6 +57,8 @@ jobs:
python-version: 3.x
- run: pip install git+https://github.com/NBChub/bgcflow_wrapper.git
- run: pip install pytest-cov
- name: Test coverage
run: pytest --cov=.tests/unit .tests/unit/
- name: Build coverage file
run: pytest --cov=.tests/unit .tests/unit/ > pytest-coverage.txt
- name: Comment coverage
2 changes: 2 additions & 0 deletions .gitignore
@@ -24,3 +24,5 @@ notebooks/
*.ipynb_checkpoints/
plugins/
metabase.db*
pytest-coverage.txt
.coverage
43 changes: 23 additions & 20 deletions .tests/unit/test_antismash_overview_gather.py
@@ -1,15 +1,14 @@
import os
import sys

import subprocess as sp
from tempfile import TemporaryDirectory
import shutil
import subprocess as sp
import sys
from pathlib import Path, PurePosixPath

sys.path.insert(0, os.path.dirname(__file__))
from tempfile import TemporaryDirectory

import common

sys.path.insert(0, os.path.dirname(__file__))


def test_antismash_overview_gather():

@@ -21,24 +20,28 @@ def test_antismash_overview_gather():
shutil.copytree(data_path, workdir)

# dbg
print("data/processed/Lactobacillus_delbrueckii/tables/df_antismash_6.1.1_bgc.csv", file=sys.stderr)

# Run the test job.
sp.check_output([
"python",
"-m",
"snakemake",
print(
"data/processed/Lactobacillus_delbrueckii/tables/df_antismash_6.1.1_bgc.csv",
"-f",
"-j1",
"--keep-target-files",
file=sys.stderr,
)

"--directory",
workdir,
])
# Run the test job.
sp.check_output(
[
"python",
"-m",
"snakemake",
"data/processed/Lactobacillus_delbrueckii/tables/df_regions_antismash_6.1.1.csv",
"-f",
"-j1",
"--keep-target-files",
"--directory",
workdir,
]
)

# Check the output byte by byte using cmp.
# To modify this behavior, you can inherit from common.OutputChecker in here
# and overwrite the method `compare_files(generated_file, expected_file)`,
# also see common.py.
common.OutputChecker(data_path, expected_path, workdir).check()
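
The comment above points to `common.OutputChecker`; for readers who want content-aware comparisons instead of byte-by-byte `cmp`, a minimal sketch of overriding `compare_files` might look like this (assuming pandas is available and the outputs are CSV tables; the subclass name is hypothetical):

```python
import pandas as pd

import common


class CSVOutputChecker(common.OutputChecker):
    """Hypothetical checker that compares CSV outputs by content instead of raw bytes."""

    def compare_files(self, generated_file, expected_file):
        # Load both CSVs and compare values; check_like=True ignores column order,
        # so cosmetic reformatting of the table does not fail the test.
        generated = pd.read_csv(generated_file)
        expected = pd.read_csv(expected_file)
        pd.testing.assert_frame_equal(generated, expected, check_like=True)


# Usage would mirror the test above:
# CSVOutputChecker(data_path, expected_path, workdir).check()
```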
Empty file removed data/interim/diamond/.gitkeep
Empty file removed data/interim/mlst/.gitkeep
3 changes: 2 additions & 1 deletion workflow/BGC
@@ -140,7 +140,7 @@ DF_SAMPLES.to_csv(bgcflow_util_dir / "samples.csv", index=False)
##### 2. Generate wildcard constants #####
PROJECT_IDS = list(DF_PROJECTS.name.unique())
STRAINS = DF_SAMPLES.genome_id.to_list()
BGCS = STRAINS = DF_SAMPLES.bgc_id.to_list()
BGCS = DF_SAMPLES.bgc_id.to_list()
CUSTOM = DF_SAMPLES[DF_SAMPLES.source.eq("custom")].genome_id.to_list()
NCBI = DF_SAMPLES[DF_SAMPLES.source.eq("ncbi")].genome_id.to_list()
PATRIC = DF_SAMPLES[DF_SAMPLES.source.eq("patric")].genome_id.to_list()
@@ -179,3 +179,4 @@ include: "rules/antismash.smk"
include: "rules/bigslice.smk"
include: "rules/clinker.smk"
include: "rules/interproscan.smk"
include: "rules/mmseqs2.smk"
8 changes: 6 additions & 2 deletions workflow/Snakefile
@@ -48,11 +48,14 @@ custom_resource_dir()


##### Target rules #####

final_outputs = get_final_output(DF_SAMPLES, PEP_PROJECTS, rule_dict_path="workflow/rules.yaml")

rule all:
input:
expand("data/processed/{name}/tables/df_gtdb_meta.csv", name=PROJECT_IDS),
get_final_output(DF_SAMPLES, PEP_PROJECTS, rule_dict_path="workflow/rules.yaml"),

final_outputs,
expand("data/processed/{name}/data_warehouse/tables.log", name=PROJECT_IDS)


##### Modules #####
@@ -79,3 +82,4 @@ include: "rules/bgc.smk"
include: "rules/diamond.smk"
include: "rules/deeptfactor.smk"
include: "rules/cblaster.smk"
include: "rules/data_warehouse.smk"
3 changes: 2 additions & 1 deletion workflow/bgcflow/bgcflow/data/bgc_downstream_prep.py
@@ -10,7 +10,7 @@
logging.basicConfig(format=log_format, datefmt=date_format, level=logging.DEBUG)


def bgc_downstream_prep(input_dir, output_dir):
def bgc_downstream_prep(input_dir, output_dir, selected_bgcs=False):
"""
Given an antiSMASH directory, check for changed name
"""
@@ -26,6 +26,7 @@ def bgc_downstream_prep(input_dir, output_dir):
change_log = {genome_id: {}}

for gbk in path.glob("*.gbk"):
logging.info(f"Parsing file: {selected_bgcs}")
logging.info(f"Parsing file: {gbk.name}")
region = SeqIO.parse(str(gbk), "genbank")
for record in region:
86 changes: 86 additions & 0 deletions workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
@@ -0,0 +1,86 @@
import json
import logging
import sys
from pathlib import Path

from Bio import SeqIO

log_format = "%(levelname)-8s %(asctime)s %(message)s"
date_format = "%d/%m %H:%M:%S"
logging.basicConfig(format=log_format, datefmt=date_format, level=logging.DEBUG)


def bgc_downstream_prep(input_dir, output_dir, selected_bgcs=False):
"""
Given an antiSMASH directory, check for changed name
"""
logging.info(f"Reading input directory: {input_dir}")
path = Path(input_dir)
if not path.is_dir():
raise FileNotFoundError(f"No such file or directory: {path}")

genome_id = path.name
outpath = Path(output_dir) / genome_id
outpath.mkdir(parents=True, exist_ok=True)
logging.debug(f"Deducting genome id as {genome_id}")

change_log = {genome_id: {}}
ctr = 0
matches = [Path(i).stem for i in selected_bgcs.split()]
for gbk in path.glob("*.gbk"):
if gbk.stem in matches:
logging.debug(f"MATCH: {gbk.stem}")
ctr = ctr + 1
logging.info(f"Parsing file: {gbk.name}")
region = SeqIO.parse(str(gbk), "genbank")
for record in region:
logging.info(f"{gbk} {record.id}")
record_log = {}
if "comment" in record.annotations:
filename = gbk.name
try:
original_id = record.annotations["structured_comment"][
"antiSMASH-Data"
]["Original ID"].split()[0]
except KeyError:
original_id = record.id
logging.warning(
f"Found shortened record.id: {record.id} <- {original_id}."
)

# generate symlink
new_filename = filename.replace(record.id, original_id)
target_path = Path.cwd() / gbk # target for symlink

link = outpath / new_filename

logging.info(f"Generating symlink: {link}")
try:
link.symlink_to(target_path)
except FileExistsError:
logging.warning(
f"Previous symlink exist, updating target: {link} -> {target_path}"
)
link.unlink()
link.symlink_to(target_path)

record_log["record_id"] = record.id
record_log["original_id"] = original_id
record_log["target_path"] = str(gbk)
record_log["symlink_path"] = str(link)
else:
logging.warning(f"No Comments in record: {gbk.name}")

change_log[genome_id][filename] = record_log
# assert 1+1==3
with open(
outpath / f"{genome_id}-change_log.json", "w", encoding="utf8"
) as json_file:
json.dump(change_log, json_file, indent=4)

logging.info(f"{genome_id}: Job done!\n")
return


if __name__ == "__main__":
bgc_downstream_prep(sys.argv[1], sys.argv[2], sys.argv[3])
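
A minimal sketch of how this new script is meant to be driven, assuming the third command-line argument is a whitespace-separated list of selected region GenBank paths (all paths below are illustrative):

```python
from pathlib import Path

# Illustrative value for sys.argv[3]: a whitespace-separated list of selected
# region GenBank files, as consumed by bgc_downstream_prep above.
selected_bgcs = (
    "data/interim/bgcs/NC_008054.1.region001.gbk "
    "data/interim/bgcs/NC_008054.1.region002.gbk"
)

# Only antiSMASH .gbk files whose stem appears in this list are symlinked.
matches = [Path(i).stem for i in selected_bgcs.split()]
print(matches)
# ['NC_008054.1.region001', 'NC_008054.1.region002']

# Command-line form (illustrative paths):
# python bgc_downstream_prep_selection.py <antismash_genome_dir> <output_dir> "<selected gbk paths>"
```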
10 changes: 10 additions & 0 deletions workflow/bgcflow/bgcflow/data/fix_gtdb_taxonomy.py
@@ -44,6 +44,16 @@ def summarize_gtdb_json(accession_list, df_gtdb_output):
# Getting other metadata
try:
logging.info("Getting metadata into table...")
if "metadata" not in df.columns:
logging.warning(
"metadata is not in the column information. Adding default values..."
)
df["metadata"] = [{"genome_id": genome_id} for genome_id in df.index]
if "gtdb_release" not in df.columns:
logging.warning(
"gtdb_release is not in the column information. Adding default values..."
)
df["gtdb_release"] = "unknown"
metadata = pd.DataFrame.from_dict(
{i: df.loc[i, "metadata"] for i in df.index}
).T
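
The defensive defaults added above can be reproduced on a toy table; the accessions below are made up:

```python
import pandas as pd

# Toy GTDB-style table that is missing both optional columns.
df = pd.DataFrame(index=["GCF_000001.1", "GCF_000002.1"])

if "metadata" not in df.columns:
    df["metadata"] = [{"genome_id": genome_id} for genome_id in df.index]
if "gtdb_release" not in df.columns:
    df["gtdb_release"] = "unknown"

# The downstream code can now expand the metadata column without a KeyError.
metadata = pd.DataFrame.from_dict({i: df.loc[i, "metadata"] for i in df.index}).T
print(metadata["genome_id"].to_list())
# ['GCF_000001.1', 'GCF_000002.1']
```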
28 changes: 23 additions & 5 deletions workflow/bgcflow/bgcflow/data/get_antismash_overview.py
@@ -42,11 +42,15 @@ def get_antismash_overview(json_path, outfile, genome_id=False, n_hits=1):
with open(path, "r") as f:
data = json.load(f)

logging.info(f"Processing: {json_path}, custom genome_id: {genome_id}")

if not genome_id:
genome_id = data["input_file"].strip(".gbk")
else:
pass

logging.debug(f"Genome id: {genome_id}")

# iterating over record
output = {}
for r, record in enumerate(data["records"]):
@@ -94,7 +98,7 @@

bgc_id = f"{record['id']}.region{str(c+1).zfill(3)}"
output_cluster = {
"genome_id": data["input_file"].strip(".gbk"),
"genome_id": genome_id,
"region": cluster_id,
}

Expand All @@ -106,10 +110,24 @@ def get_antismash_overview(json_path, outfile, genome_id=False, n_hits=1):
"product",
]:
output_cluster[column] = region_db[bgc_id][column]

output_cluster["region_length"] = int(output_cluster["end_pos"]) - int(
output_cluster["start_pos"]
)
try:
output_cluster["region_length"] = int(output_cluster["end_pos"]) - int(
output_cluster["start_pos"]
)
except ValueError:
logging.warning(
f'Error calculating region length. Region might be incomplete: {output_cluster["start_pos"]}:{output_cluster["end_pos"]}'
)
start_pos = "".join(
[s for s in output_cluster["start_pos"] if s.isdigit()]
)
logging.warning(
f'Correcting start position from {output_cluster["start_pos"]} to {start_pos}'
)
output_cluster["start_pos"] = start_pos
output_cluster["region_length"] = int(output_cluster["end_pos"]) - int(
output_cluster["start_pos"]
)

if len(output_hits) == 1:
for k in output_hits[0].keys():
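
The try/except added above recovers from region coordinates that are not plain integers (for example a partial start position such as `<1`); the same correction in isolation, with made-up coordinates:

```python
# Standalone illustration of the correction above; the coordinates are made up.
output_cluster = {"start_pos": "<1", "end_pos": "45021"}

try:
    region_length = int(output_cluster["end_pos"]) - int(output_cluster["start_pos"])
except ValueError:
    # Keep only the digits (dropping markers such as '<' or '>') and retry.
    start_pos = "".join(s for s in output_cluster["start_pos"] if s.isdigit())
    output_cluster["start_pos"] = start_pos
    region_length = int(output_cluster["end_pos"]) - int(output_cluster["start_pos"])

print(region_length)  # 45020
```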