VADR Patch + FASTA Enhancement (Issue #116 / Issue #82) #118

Merged
merged 23 commits into from
Jan 9, 2024
Commits
23 commits
4273203
split mpox fasta files (now expected from user)
ankushkgupta2 Jan 9, 2024
4a1109b
split GFF annotation files (user provided annotations) are now allowable
ankushkgupta2 Jan 9, 2024
8eccb89
removed blanket assets/ block
ankushkgupta2 Jan 9, 2024
a4a9ae4
name change of the vadr subworkflow + move from under entrypoints to …
ankushkgupta2 Jan 9, 2024
0ebf6cf
changed the description of only_vadr to running the subworkflow and n…
ankushkgupta2 Jan 9, 2024
fdecf7a
moved the check_fasta_names and check_fasta_path to the general_utility
ankushkgupta2 Jan 9, 2024
cfadd9f
created a new general utility python file containing the general fast…
ankushkgupta2 Jan 9, 2024
ee6ab07
removed the fasta path/names checks (no longer needed)
ankushkgupta2 Jan 9, 2024
81788fa
removed splitting of fasta and all instances of code that is fasta re…
ankushkgupta2 Jan 9, 2024
e07efba
moved the fasta paths/names functions to general utility and anything…
ankushkgupta2 Jan 9, 2024
661eda7
changed the default path for final_split_fastas_path to the fasta_path
ankushkgupta2 Jan 9, 2024
ca52081
new process that checks the FASTA files / stages them for downstream …
ankushkgupta2 Jan 9, 2024
2ef48ce
main label added
ankushkgupta2 Jan 9, 2024
83bbbbd
removed unnecessary signal value
ankushkgupta2 Jan 9, 2024
63eaa3c
removed fasta input and fasta output (no longer needed)
ankushkgupta2 Jan 9, 2024
4363d3f
removed fasta input and fasta output (no longer needed)
ankushkgupta2 Jan 9, 2024
d13568e
commented out a couple of parameters, need to update this with submis…
ankushkgupta2 Jan 9, 2024
4cb96af
formatting changes
ankushkgupta2 Jan 9, 2024
5d44dc1
no longer need entry flag
ankushkgupta2 Jan 9, 2024
3f3a811
included the initialization of files process within the workflow for …
ankushkgupta2 Jan 9, 2024
8d0dd95
removed fasta_path
ankushkgupta2 Jan 9, 2024
522631e
added the new initialize_files process + removed fasta_path as input …
ankushkgupta2 Jan 9, 2024
940899b
changes to the vadr subworkflow name, now no longer under entrypoint …
ankushkgupta2 Jan 9, 2024
2 changes: 1 addition & 1 deletion .gitignore
@@ -57,7 +57,7 @@ tests/__pycache__
/bin/vadr_outputs/

# bakta related
/assets
# /assets
/assets/bakta_database
/assets/bakta_database/db_light
/assets/bakta_database/db_light/
2 changes: 1 addition & 1 deletion README.md
@@ -318,7 +318,7 @@ Table of available entrypoints:
| only_validation | Runs the metadata validation process only |
| only_liftoff | Runs the liftoff annotation process only |
| only_repeatmasker_liftoff | Runs repeatmasker for repeats and liftoff for functional regions, then combines the GFF outputs |
| only_vadr | Runs the VADR annotation process only |
| only_vadr | Runs the VADR annotation subworkflow only |
| only_bakta | Runs the Bakta annotation process only |
| only_submission | Runs submission sub-workflow only. Requires specific inputs mentioned here: [Required Files for Submission Entrypoint](#required-files-for-submission-entrypoint) |
| only_initial_submission | Runs the initial submission process but not follow-up within the submission sub-workflow. Requires specific inputs mentioned here: [Required Files for Submission Entrypoint](#required-files-for-submission-entrypoint) |
2 changes: 2 additions & 0 deletions assets/mpox_split_fastas/FL0004_1.fasta

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions assets/mpox_split_fastas/FL0015_2.fasta

2 changes: 2 additions & 0 deletions assets/mpox_split_fastas/IL0005_3.fasta

2 changes: 2 additions & 0 deletions assets/mpox_split_fastas/NY0006_4.fasta

2 changes: 2 additions & 0 deletions assets/mpox_split_fastas/NY0007_5.fasta

2 changes: 2 additions & 0 deletions assets/mpox_split_fastas/OH0002_6.fasta

2 changes: 2 additions & 0 deletions assets/mpox_split_fastas/TX0001_7.fasta

1 change: 1 addition & 0 deletions assets/mpox_split_gffs/FL0004.tsv
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/FL0004_reformatted.gff
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/FL0015.tsv
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/FL0015_reformatted.gff
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/IL0005.tsv
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/IL0005_reformatted.gff
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/NY0006.tsv
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/NY0006_reformatted.gff
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/NY0007.tsv
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/NY0007_reformatted.gff
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/OH0002.tsv
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/OH0002_reformatted.gff
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/TX0001.tsv
1 change: 1 addition & 0 deletions assets/mpox_split_gffs/TX0001_reformatted.gff
53 changes: 0 additions & 53 deletions bin/annotation_utility.py
@@ -12,60 +12,7 @@ class MainUtility:
def __init__(self):
"""
"""

def check_fasta_names(self, meta_df, input_fasta_path, output_fasta_path, fasta_column):
"""
Checks the name of the FASTA files to make sure it is aligned with the sample name from metadata
Places these modified FASTA files into new created directory
"""
for index, row in meta_df.iterrows():
# get the sample name
sample_name = row['sample_name']
# get the fasta file name
fasta_file_name = row[fasta_column]

# copy the file with the new name
shutil.copy(f"{input_fasta_path}/{fasta_file_name}", f"{output_fasta_path}/{sample_name}.fasta")

def check_fasta_path(self, meta_df, fasta_path):
""" This checks the fasta_path passed in and the fasta paths within the metadata sheet, to make sure they are aligned
"""
# try finding a fasta_path field
if 'fasta_file' in [x.strip().lower() for x in meta_df.columns]:
fasta_column = 'fasta_file'
elif 'fasta_file_name' in [x.strip().lower() for x in meta_df.columns]:
fasta_column = 'fasta_file_name'
elif 'fasta_name' in [x.strip().lower() for x in meta_df.columns]:
fasta_column = 'fasta_name'
elif 'fasta' in [x.strip().lower() for x in meta_df.columns]:
fasta_column = 'fasta'
else:
# try some other variant as last resort
# get all fields with fasta in it
fasta_fields = [x for x in meta_df.columns if 'fasta' in x.strip().lower()]
if fasta_fields:
# just use the first field to get values
fasta_column = fasta_fields[0]
else:
raise Exception(f"Could not find the column named fasta_file_name within metadata sheet. Please make sure it exists")

# get these values
fasta_file_names = meta_df[fasta_column].to_list()

# now cycle through these values and make sure they are no repeats for different sample names
if len(set(fasta_file_names)) != len(fasta_file_names):
raise Exception("Cannot have multiple samples in metadata sheet pointing to same FASTA file")

# check all of them are located at fasta_path location
list_of_fastas = [file for file in os.listdir(fasta_path) if file.endswith(".fasta")]
for name in fasta_file_names:
try:
assert name.strip() in list_of_fastas
except:
raise AssertionError(f"Missing {name} from the directory named {fasta_path}. Only files in dir are: {list_of_fastas}")

return fasta_column

def split_fasta(self, fasta_path, fasta_output):
"""
Parses fasta file and writes it
123 changes: 123 additions & 0 deletions bin/general_utility.py
@@ -0,0 +1,123 @@
#!/usr/bin/env python3

import argparse
import pandas as pd
import os
import shutil

from annotation_utility import MainUtility as main_util


def get_args():
parser = argparse.ArgumentParser(description="Parameters for General Utility Functions")
parser.add_argument("--check_fasta_paths", action="store_true", help="Flag whether to check FASTA paths or not")
parser.add_argument("--check_fasta_names", action="store_true", help="Flag whether to check FASTA names or not")
parser.add_argument("--meta_path", type=str, help="Path to the input metadata file")
parser.add_argument("--fasta_path", type=str, help="Path to the fasta file(s)")
parser.add_argument("--output_fasta_path", type=str, help="Path to the output directory for fasta file(s)")
return parser

def main():

# get the parameters
args = get_args().parse_args()
parameters = vars(args)

# instantiate the class
general_util = GeneralUtil()

# convert the metadata file to pandas dataframe
meta_df = pd.read_excel(parameters['meta_path'], header=[1])

# determine the FASTA column and optionally validate the fasta path
fasta_column = None
if parameters['check_fasta_paths']:
    # call the check_fasta_path function
    fasta_column = general_util.check_fasta_path(
        meta_df=meta_df,
        fasta_path=parameters['fasta_path']
    )

# check if need to perform checks for fasta file names
if parameters['check_fasta_names']:
    # the FASTA column must be resolved before the names can be checked
    if fasta_column is None:
        fasta_column = general_util.check_fasta_path(
            meta_df=meta_df,
            fasta_path=parameters['fasta_path']
        )
    general_util.check_fasta_names(
        meta_df=meta_df,
        input_fasta_path=parameters['fasta_path'],
        output_fasta_path=parameters['output_fasta_path'],
        fasta_column=fasta_column
    )


class GeneralUtil():
def __init__(self):
self.main_util = main_util()

@staticmethod
def check_fasta_path(meta_df, fasta_path):
""" This checks the fasta_path passed in and the fasta paths within the metadata sheet, to make sure they are aligned
"""
# try finding a fasta_path field
if 'fasta_file' in [x.strip().lower() for x in meta_df.columns]:
fasta_column = 'fasta_file'
elif 'fasta_file_name' in [x.strip().lower() for x in meta_df.columns]:
fasta_column = 'fasta_file_name'
elif 'fasta_name' in [x.strip().lower() for x in meta_df.columns]:
fasta_column = 'fasta_name'
elif 'fasta' in [x.strip().lower() for x in meta_df.columns]:
fasta_column = 'fasta'
else:
# try some other variant as a last resort:
# grab all fields with 'fasta' in the name
fasta_fields = [x for x in meta_df.columns if 'fasta' in x.strip().lower()]
if fasta_fields:
    # just use the first such field
    fasta_column = fasta_fields[0]
else:
    raise Exception("Could not find a FASTA file name column (e.g. fasta_file_name) within the metadata sheet. Please make sure it exists")

# get these values
fasta_file_names = meta_df[fasta_column].to_list()

# now cycle through these values and make sure there are no duplicates across different sample names
if len(set(fasta_file_names)) != len(fasta_file_names):
    raise Exception("Cannot have multiple samples in metadata sheet pointing to the same FASTA file")

# check that all of them are located at the fasta_path location
list_of_fastas = [file for file in os.listdir(fasta_path) if file.endswith(".fasta")]
for name in fasta_file_names:
    if name.strip() not in list_of_fastas:
        raise AssertionError(f"Missing {name} from the directory named {fasta_path}. Only files in dir are: {list_of_fastas}")

return fasta_column

@staticmethod
def check_fasta_names(meta_df, input_fasta_path, output_fasta_path, fasta_column):
"""
Checks that the names of the FASTA files align with the sample names from the metadata
Places the renamed FASTA files into a newly created directory
"""
"""
# create the output directory (world-writable, as `mkdir -p -m777` intended; mode is subject to umask)
os.makedirs(output_fasta_path, mode=0o777, exist_ok=True)

for index, row in meta_df.iterrows():
# get the sample name
sample_name = row['sample_name']
# get the fasta file name
fasta_file_name = row[fasta_column]

# copy the file with the new name
shutil.copy(f"{input_fasta_path}/{fasta_file_name}", f"{output_fasta_path}/{sample_name}.fasta")

def check_if_unzip(self, fasta_path):
    """ Checks whether fasta_path points to a gzipped (.gz) archive of individual fasta files, unzipping it before collecting sample names
    """
    # unzip the fasta file first if needed
    if fasta_path.split('/')[-1].split('.')[-1] == 'gz':
        fasta_path = self.main_util.unzip_fasta(fasta_path=fasta_path)
    fasta_names = self.main_util.get_fasta_sample_names(fasta_path=fasta_path)
    return fasta_names
    # TODO: NEED TO CHECK THIS UNZIPPED SITUATION

if __name__ == "__main__":
main()
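The column-detection cascade in `check_fasta_path` can be illustrated in isolation. The sketch below is a simplified, hypothetical re-implementation (the helper name `find_fasta_column` and the toy DataFrames are not part of the PR); it mirrors the preference order used above while also returning the column's original spelling:

```python
import pandas as pd

# Preferred FASTA column names, in the order the cascade above tries them
PREFERRED = ['fasta_file', 'fasta_file_name', 'fasta_name', 'fasta']

def find_fasta_column(meta_df):
    """Return the metadata column holding FASTA file names, mirroring check_fasta_path."""
    # map normalized (stripped, lowercased) names back to the real column labels
    normalized = {c.strip().lower(): c for c in meta_df.columns}
    for candidate in PREFERRED:
        if candidate in normalized:
            return normalized[candidate]
    # last resort: any column whose name merely contains 'fasta'
    fallback = [c for c in meta_df.columns if 'fasta' in c.strip().lower()]
    if fallback:
        return fallback[0]
    raise ValueError("No FASTA file name column found in metadata sheet")

df = pd.DataFrame({'sample_name': ['A'], 'Fasta_File_Name': ['A.fasta']})
print(find_fasta_column(df))  # Fasta_File_Name
```

Note that the PR's version returns the lowercased candidate string itself, so a column spelled `Fasta_File_Name` would later fail the `meta_df[fasta_column]` lookup; resolving back to the original label, as sketched here, avoids that.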
21 changes: 8 additions & 13 deletions bin/liftoff_submission.py
@@ -112,7 +112,7 @@ def get_args():
"""
parser = argparse.ArgumentParser(description="Parameters for Running Liftoff Submission")
# Required Arguments
parser.add_argument("--fasta_path", type=str, help="Non reference path to input Multi Fasta file \n")
parser.add_argument("--fasta_path", type=str, help="Path (non-reference) to the directory containing single-sample FASTA files \n")
parser.add_argument("--ref_fasta_path", type=str, help="Reference path to fasta file \n")
parser.add_argument("--meta_path", type=str, help="Path to excel spreadsheet for MetaData \n")
parser.add_argument("--ref_gff_path", type=str, help="Path to the input gff file.... expects gff3 format")
@@ -194,18 +194,13 @@ def prep_main(self):
# load the meta data file
self.load_meta()

# checks whether samples are shared between meta and fasta
fasta_column = self.main_util.check_fasta_path (
meta_df=self.meta_df,
fasta_path=self.parameters['fasta_path']
)
# checks the name of the fasta file is aligned with sample name
self.main_util.check_fasta_names (
meta_df=self.meta_df,
input_fasta_path=self.parameters['fasta_path'],
output_fasta_path=f"{self.parameters['fasta_temp']}",
fasta_column=fasta_column
)
# move the fasta files over to the temp directory
for index, row in self.meta_df.iterrows():
# get the sample name
sample_name = row['sample_name']
# copy the file over based on the sample name
shutil.copy(f"{self.parameters['fasta_path']}/{sample_name}.fasta", f"{self.parameters['fasta_temp']}/{sample_name}.fasta")

# get the length of sequences for each sample
self.get_seq_lens()

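The new `prep_main` staging step above replaces the name/path checks with a straight per-sample copy. A minimal, self-contained sketch of that loop (the `stage_fastas` helper and the throwaway directories are illustrative, not part of the PR):

```python
import os
import shutil
import tempfile

def stage_fastas(meta_rows, fasta_path, fasta_temp):
    """Copy each sample's FASTA into the temp directory, keyed by sample_name,
    as the rewritten prep_main now does."""
    for row in meta_rows:
        sample_name = row['sample_name']
        shutil.copy(os.path.join(fasta_path, f"{sample_name}.fasta"),
                    os.path.join(fasta_temp, f"{sample_name}.fasta"))

# demo with throwaway directories and one fabricated sample
src = tempfile.mkdtemp()
dst = tempfile.mkdtemp()
with open(os.path.join(src, "FL0004.fasta"), "w") as fh:
    fh.write(">FL0004\nACGT\n")
stage_fastas([{'sample_name': 'FL0004'}], src, dst)
print(sorted(os.listdir(dst)))  # ['FL0004.fasta']
```

The design assumption is that upstream (the new `initialize_files` process) has already renamed each FASTA to `<sample_name>.fasta`, so the copy can fail loudly with `FileNotFoundError` if a sample is missing.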
9 changes: 1 addition & 8 deletions bin/post_vadr_cleanup.py
@@ -22,18 +22,12 @@ def vadr_main():

# initialize the directory structure + move the original input files into this location
shutil.copytree(parameters['vadr_outputs'], f"{parameters['vadr_outdir']}/{meta_filename}/original_outputs")
for dir_name in ['', 'fasta', 'gffs', 'tbl', 'errors']:
for dir_name in ['', 'gffs', 'tbl', 'errors']:
os.mkdir(f"{parameters['output_path']}/{dir_name}")

# instantiate the class object
main_funcs = MainVADRFuncs(parameters)

# split the fasta file and save it
main_util.split_fasta (
fasta_path=parameters['fasta_path'],
fasta_output=f"{parameters['output_path']}/fasta/"
)

# split the outputted tables into separate samples
main_funcs.split_table()

@@ -53,7 +47,6 @@ def get_args():
parser = argparse.ArgumentParser(description="Parameters for Running VADR Annotation")
parser.add_argument("--vadr_outdir", type=str, default='vadr_outputs', help="Name of vadr output directory")
parser.add_argument("--vadr_outputs", type=str, help="Path to the vadr outputs")
parser.add_argument("--fasta_path", type=str, help="Path to the input fasta file")
parser.add_argument("--meta_path", type=str, help="Path to the input metadata file")
return parser

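The directory loop in `post_vadr_cleanup.py` relies on a small trick: the `''` entry makes the first `os.mkdir` call create `output_path` itself (joining a path with `''` just appends a trailing separator) before the subdirectories are made inside it. A standalone sketch, with a hypothetical `make_output_dirs` wrapper:

```python
import os
import tempfile

def make_output_dirs(output_path, subdirs=('gffs', 'tbl', 'errors')):
    """Create the cleanup output scaffold. The '' entry creates output_path
    itself first, mirroring the loop in post_vadr_cleanup.py (minus 'fasta',
    which this PR drops)."""
    for dir_name in ('',) + tuple(subdirs):
        # os.path.join(output_path, '') == output_path + os.sep
        os.mkdir(os.path.join(output_path, dir_name))

base = os.path.join(tempfile.mkdtemp(), "vadr_clean")
make_output_dirs(base)
print(sorted(os.listdir(base)))  # ['errors', 'gffs', 'tbl']
```

Because `os.mkdir` raises `FileExistsError` on reruns, `os.makedirs(..., exist_ok=True)` per directory would be a more forgiving alternative.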