vsnp3 version 3.26

USDA-VS · Sep 25, 2024 · 902d5a9 · 902d5a9
1 parent 0974549
commit 902d5a9
Show file tree

Hide file tree

Showing 27 changed files with 58 additions and 44 deletions.
diff --git a/README.md b/README.md
@@ -53,7 +53,7 @@ This step combines the VCF files from Step 1 to create SNP matrices and construc
 # Installation
 
 ```bash
-conda create -c conda-forge -c bioconda -n vsnp3 vsnp3=3.25
+conda create -c conda-forge -c bioconda -n vsnp3 vsnp3=3.26
 ```
 
 For detailed Miniconda setup instructions, see [conda instructions](./docs/instructions/conda_instructions.md).
@@ -178,6 +178,9 @@ For detailed usage of each script, use the `-h` option.
 
 For information on additional tools, see [Additional Tools](./docs/instructions/additional_tools.md).
 
+## Archived Detail
+Archived detailed usage notes for vSNP are available [here](https://github.com/USDA-VS/vSNP/blob/master/docs/detailed_usage.md).
+
 ##
 
 For more information or support, please open an [issue on GitHub](https://github.com/USDA-VS/vSNP3/issues) or [email](mailto:[email protected]) us directly.

diff --git a/bin/vsnp3_alignment_vcf.py b/bin/vsnp3_alignment_vcf.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import subprocess

diff --git a/bin/vsnp3_annotation.py b/bin/vsnp3_annotation.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import shutil

diff --git a/bin/vsnp3_assembly.py b/bin/vsnp3_assembly.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import sys

diff --git a/bin/vsnp3_best_reference_sourmash.py b/bin/vsnp3_best_reference_sourmash.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import subprocess

diff --git a/bin/vsnp3_bruc_mlst.py b/bin/vsnp3_bruc_mlst.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import io

diff --git a/bin/vsnp3_download_GCA_fasta_get_metadata.py b/bin/vsnp3_download_GCA_fasta_get_metadata.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import sys

diff --git a/bin/vsnp3_download_fasta_gbk_gff_by_acc.py b/bin/vsnp3_download_fasta_gbk_gff_by_acc.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import argparse

diff --git a/bin/vsnp3_excel_merge_defining_snps.py b/bin/vsnp3_excel_merge_defining_snps.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import re

diff --git a/bin/vsnp3_fasta_to_fastq.py b/bin/vsnp3_fasta_to_fastq.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import gzip
 import os

diff --git a/bin/vsnp3_fasta_to_snps_table.py b/bin/vsnp3_fasta_to_snps_table.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import subprocess
@@ -141,12 +141,13 @@ def get_parsimonious_pos(self, in_df):
 
 class Tables:
 
-    def __init__(self, fasta_alignments=None, df_alignments=None, tree=None, gbk=None, mq=None, write_path=None, groupings_dict=None, table_name=None, debug=False,):
+    def __init__(self, fasta_alignments=None, df_alignments=None, tree=None, gbk=None, mq=None, write_path=None, groupings_dict=None, show_groups=False, table_name=None, debug=False,):
         self.fasta_alignments = fasta_alignments
         self.df_alignments = df_alignments
         self.tree = tree
         self.gbk = gbk
         self.mq = mq
+        self.show_groups = show_groups
         self.debug = debug
         self.st = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')
         self.groupings_dict = groupings_dict
@@ -289,10 +290,11 @@ def write_out_table(self, df, table_type=None):
             # df = df.append(pd.Series(name='no annotations'))
             df = pd.concat([df, pd.Series(name='no annotations').to_frame().T])
 
-        # Join the list items in the dictionary values into single strings
-        joined_data = {key: '; '.join(map(str, value)) for key, value in self.group_vcfs_dict.items()}
-        # Make groupings into a new Series from the dictionary of sample names: [list group names]
-        new_series = pd.Series(joined_data)
+        if self.show_groups:
+            # Join the list items in the dictionary values into single strings
+            joined_data = {key: '; '.join(map(str, value)) for key, value in self.group_vcfs_dict.items()}
+            # Make groupings into a new Series from the dictionary of sample names: [list group names]
+            new_series = pd.Series(joined_data)
 
         # Check if 'Grouping' column already exists
         # if 'Grouping' not in df.columns:
@@ -309,7 +311,7 @@ def write_out_table(self, df, table_type=None):
                 # print(f'{column_count} columns > {max_size}, cascade table break {count}')
                 chunck_end += max_size
                 df_split = df.iloc[:, chunk_start:chunck_end]
-                if 'Grouping' not in df.columns:
+                if 'Grouping' not in df.columns and self.show_groups:
                     df_split.insert(0, 'Grouping', new_series)
                 df_split.to_json(f'{self.write_path}/df{count}.json', orient='split')
                 self.excel_formatter(f'{self.write_path}/df{count}.json', f'{self.write_path}/{self.table_name}_{table_type}_table{count}-{self.st}.xlsx')
@@ -319,14 +321,14 @@ def write_out_table(self, df, table_type=None):
             count += 1
             # print(f'Last break {column_count} columns, cascade table break {count}')
             df_split = df.iloc[:, chunk_start:]
-            if 'Grouping' not in df.columns:
+            if 'Grouping' not in df.columns and self.show_groups:
                 df_split.insert(0, 'Grouping', new_series)
             df_split.to_json(f'{self.write_path}/df{count}.json', orient='split')
             self.excel_formatter(f'{self.write_path}/df{count}.json', f'{self.write_path}/{self.table_name}_{table_type}_table{count}-{self.st}.xlsx')
             os.remove(f'{self.write_path}/df{count}.json')
         else: # no break needed
             # Insert the new column at position 1 (right after the sample names column)
-            if 'Grouping' not in df.columns:
+            if 'Grouping' not in df.columns and self.show_groups:
                 df.insert(0, 'Grouping', new_series)
             df.to_json(f'{self.write_path}/df.json', orient='split')
             self.excel_formatter(f'{self.write_path}/df.json', f'{self.write_path}/{self.table_name}_{table_type}_table-{self.st}.xlsx')
@@ -354,20 +356,26 @@ def excel_formatter(self, df_json, write_to, group=None):
         formatN = wb.add_format({'bg_color': '#E2CFDD'})
         rows, cols = table_df.shape
 
+        #'first_row', 'first_col', 'last_row', and 'last_col'
+        # Careful that row/column locations don't overlap
+        if self.show_groups:
+            start_col = 2
+        else:
+            start_col = 1
+
         ws.set_column(0, 0, 30)
         ws.set_column(1, cols, 2.1)
-        ws.freeze_panes(2, 2)
+        ws.freeze_panes(2, start_col)
         formatannotation = wb.add_format({'font_color': '#0A028C', 'rotation': '-90', 'align': 'top'})
         #set last row
         ws.set_row(rows + 1, cols + 1, formatannotation)
-
-        #'first_row', 'first_col', 'last_row', and 'last_col'
-        # Careful that row/column locations don't overlap
-        start_col = 2  # This is column C
         end_col = cols
 
         ws.conditional_format(rows - 2, start_col, rows - 1, end_col, {'type': 'cell', 'criteria': '<', 'value': 55, 'format': formatlowqual})
-        ws.conditional_format(2, start_col, rows - 2, end_col, {'type': 'cell', 'criteria': '==', 'value': 'C$2', 'format': formatnormal})
+        if self.show_groups:
+            ws.conditional_format(2, start_col, rows - 2, end_col, {'type': 'cell', 'criteria': '==', 'value': 'C$2', 'format': formatnormal})
+        else:
+            ws.conditional_format(2, start_col, rows - 2, end_col, {'type': 'cell', 'criteria': '==', 'value': 'B$2', 'format': formatnormal})
         ws.conditional_format(2, start_col, rows - 2, end_col, {'type': 'text', 'criteria': 'containing', 'value': 'A', 'format': formatA})
         ws.conditional_format(2, start_col, rows - 2, end_col, {'type': 'text', 'criteria': 'containing', 'value': 'G', 'format': formatG})
         ws.conditional_format(2, start_col, rows - 2, end_col, {'type': 'text', 'criteria': 'containing', 'value': 'C', 'format': formatC})
@@ -506,6 +514,7 @@ def __init__(self, fasta_alignments=None, debug=False,):
 
     parser.add_argument('-f', '--fasta', action='store', dest='fasta', required=True, help='Provide an alignment file in FASTA format')
     parser.add_argument('-p', '--parsimonious', action='store_true', dest='parsimonious', help='Only keep parsimonious SNPs from FASTA alignment file.  This is different than the uninformative SNPs removed via vSNP pipeline.  This is to be used when just working with an aligned FASTA file.')
+    parser.add_argument('--show_groups', action='store_true', dest='show_groups', help='Show group names in SNP table')
     parser.add_argument('-n', '--hash_names', action='store_true', dest='hash_names', help='Hash FASTA names to rid of any RAxML illegal characters')
     parser.add_argument('-d', '--debug', action='store_true', dest='debug', help='Optional: Keep debugging files and run without pooling')
     parser.add_argument('-v', '--version', action='version', version=f'{os.path.abspath(__file__)}: version {__version__}')

diff --git a/bin/vsnp3_fastq_stats_seqkit.py b/bin/vsnp3_fastq_stats_seqkit.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import subprocess

diff --git a/bin/vsnp3_file_setup.py b/bin/vsnp3_file_setup.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import shutil

diff --git a/bin/vsnp3_group_on_defining_snps.py b/bin/vsnp3_group_on_defining_snps.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import sys
@@ -41,7 +41,7 @@ class bcolors:
 class Group():
     ''' 
     '''
-    def __init__(self, cwd=None, metadata=None, excel_remove=None, gbk_list=None, defining_snps=None, dataframes=None, pickle_file=None, abs_pos=None, group=None, all_vcf=None, find_new_filters=None, no_filters=True, qual_threshold=150, n_threshold=50, mq_threshold=56, hash_groups=None, debug=False):
+    def __init__(self, cwd=None, metadata=None, excel_remove=None, gbk_list=None, defining_snps=None, dataframes=None, pickle_file=None, abs_pos=None, group=None, all_vcf=None, find_new_filters=None, no_filters=True, qual_threshold=150, n_threshold=50, mq_threshold=56, show_groups=False, hash_groups=None, debug=False):
 
         self.qual_threshold = qual_threshold
         self.n_threshold = n_threshold
@@ -50,6 +50,7 @@ def __init__(self, cwd=None, metadata=None, excel_remove=None, gbk_list=None, de
         self.vcf_bad_list=[]
         filter_all_list=None
         defining_snps_dict = None
+        self.show_groups = show_groups
         self.debug = debug
 
         if cwd == None:
@@ -560,7 +561,7 @@ def sort_df(self, df):
 
     def raxml_table_build(self, group):
         tree = Tree(fasta_alignments=self.group_fasta_dict[group], write_path=f'{self.cwd}/{group}', tree_name=group)
-        tables = Tables(df_alignments=self.group_dataframe_dict[group], tree=tree.newick, gbk=self.annotation_df, mq=self.average_mq_df, write_path=f'{self.cwd}/{group}', groupings_dict=self.groupings_dict, table_name=group, debug=False)
+        tables = Tables(df_alignments=self.group_dataframe_dict[group], tree=tree.newick, gbk=self.annotation_df, mq=self.average_mq_df, write_path=f'{self.cwd}/{group}', groupings_dict=self.groupings_dict, show_groups=self.show_groups, table_name=group, debug=False)
         tables.build_tables()
         self.raxml_version = tree.raxml_version
 

diff --git a/bin/vsnp3_group_reporter.py b/bin/vsnp3_group_reporter.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import io

diff --git a/bin/vsnp3_html_step2_summary.py b/bin/vsnp3_html_step2_summary.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 

diff --git a/bin/vsnp3_kernel_plots.py b/bin/vsnp3_kernel_plots.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import re

diff --git a/bin/vsnp3_path_adder.py b/bin/vsnp3_path_adder.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import glob

diff --git a/bin/vsnp3_reference_options.py b/bin/vsnp3_reference_options.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import sys

diff --git a/bin/vsnp3_remove_from_analysis.py b/bin/vsnp3_remove_from_analysis.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import sys

diff --git a/bin/vsnp3_spoligotype.py b/bin/vsnp3_spoligotype.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import gzip

diff --git a/bin/vsnp3_step1.py b/bin/vsnp3_step1.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import sys

diff --git a/bin/vsnp3_step2.py b/bin/vsnp3_step2.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import sys
@@ -315,6 +315,7 @@ def __init__(self, runtime=None, vcf_to_df=None, reference=None, groupings_dict=
     parser.add_argument('-abs_pos', '--abs_pos', action='store', dest='abs_pos', required=False, help='Optional: Make a group on defining SNP.  Must be supplied with --group option.  Format as chrom in VCF, chrom:10000.')
     parser.add_argument('-group', '--group', action='store', dest='group', required=False, help='Optional: Name a group on defining SNP.  Must be supplied with --abs_pos option')
     parser.add_argument('-hash', '--hash_groups', action='store_true', dest='hash_groups', required=False, help='Optional: The option will run defining snps marked with a # in the defining snps file.  The # is removed and the defining snps are run.')
+    parser.add_argument('--show_groups', action='store_true', dest='show_groups', help='Show group names in SNP table')
     parser.add_argument('-d', '--debug', action='store_true', dest='debug', help='Optional: Keep debugging files and run without pooling.  A pickle file will be kept for troubleshooting to be used directly in vsnp3_group_on_defining_snps.py.  This saves processing time')
     parser.add_argument('-v', '--version', action='version', version=f'{os.path.basename(__file__)}: version {__version__}')
     args = parser.parse_args()
@@ -420,7 +421,7 @@ def zipit(src, dst):
         shutil.copy(args.defining_snps, starting_files) #package with starting files for the record
     zipit(starting_files, starting_files) # zip starting files directory
 
-    group = Group(cwd=global_working_dir, metadata=args.metadata, defining_snps=args.defining_snps, excel_remove=args.remove_by_name, gbk_list=args.gbk, dataframes=vcf_to_df.dataframes, all_vcf=args.all_vcf, find_new_filters=args.find_new_filters, no_filters=args.no_filters, qual_threshold=int(args.qual_threshold), n_threshold=int(args.n_threshold), mq_threshold=int(args.mq_threshold), abs_pos=args.abs_pos, group=args.group, hash_groups=args.hash_groups, debug=args.debug)
+    group = Group(cwd=global_working_dir, metadata=args.metadata, defining_snps=args.defining_snps, excel_remove=args.remove_by_name, gbk_list=args.gbk, dataframes=vcf_to_df.dataframes, all_vcf=args.all_vcf, find_new_filters=args.find_new_filters, no_filters=args.no_filters, qual_threshold=int(args.qual_threshold), n_threshold=int(args.n_threshold), mq_threshold=int(args.mq_threshold), abs_pos=args.abs_pos, group=args.group, show_groups=args.show_groups, hash_groups=args.hash_groups, debug=args.debug)
     vcf_to_df.vcf_bad_list = vcf_to_df.vcf_bad_list + group.vcf_bad_list
 
     setup.print_time()

diff --git a/bin/vsnp3_table_compare.py b/bin/vsnp3_table_compare.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import re

diff --git a/bin/vsnp3_vcf_annotation.py b/bin/vsnp3_vcf_annotation.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import re

diff --git a/bin/vsnp3_vcf_merge_to_fasta.py b/bin/vsnp3_vcf_merge_to_fasta.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import re

diff --git a/bin/vsnp3_zero_coverage.py b/bin/vsnp3_zero_coverage.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-__version__ = "3.25"
+__version__ = "3.26"
 
 import os
 import re