From ba346907e8391cc389144a8e36e05f34c0ce4f3e Mon Sep 17 00:00:00 2001
From: Jenya Kopylova <jenya.kopylov@gmail.com>
Date: Thu, 27 Oct 2016 15:06:12 +0200
Subject: [PATCH] script to generate RepoPhlAn quality genome list and
 associated taxonomies

---
 python_scripts/repophlan_genomeID_taxonomy.py | 126 ++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 python_scripts/repophlan_genomeID_taxonomy.py

diff --git a/python_scripts/repophlan_genomeID_taxonomy.py b/python_scripts/repophlan_genomeID_taxonomy.py
new file mode 100644
index 0000000..c1d63a6
--- /dev/null
+++ b/python_scripts/repophlan_genomeID_taxonomy.py
@@ -0,0 +1,126 @@
+#!/bin/python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2016--, Evguenia Kopylova.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+# Generate tsv file with RepoPhlAn quality genomes and their taxonomies.
+
+import sys
+import bz2
+from skbio import Sequence
+import skbio
+import click
+
+from os import walk
+from os.path import join, splitext, basename, isfile
+
+
+def get_genome_paths(scores_average_fp, scores_repophlan_fp):
+    """Return genome ID, tax_id and .fna.bz2 path.
+
+    Parameters
+    ----------
+    scores_average_fp: str
+        RepoPhlAn screened report
+    scores_repophlan_fp: str
+        RepoPhlAn's output report from repophlan_get_microbes.py
+
+    Returns
+    -------
+    genomes: dict
+        keys are RepoPhlAn genome IDs and values are list of taxonomy str and
+        FASTA filepath
+    genomes_without_tax_id: list
+        RepoPhlAn genome IDs without a listed taxonomy
+    """
+    genomes = {}
+    # Get quality filtered genome ID and filepath to .fna.bz2
+    with open(scores_average_fp) as scores_average_f:
+        # header
+        line = scores_average_f.readline()
+        line = line.strip().split('\t')
+        fna_idx = line.index('fna_lname')
+        for line in scores_average_f:
+            line = line.strip().split('\t')
+            genome_id = line[0]
+            fna_fp = line[fna_idx]
+            if genome_id not in genomes:
+                genomes[genome_id] = [fna_fp]
+            else:
+                raise ValueError("Duplicate genome IDs %s" % genome_id)
+    # Get taxonomy
+    genomes_without_taxonomy = []
+    with open(scores_repophlan_fp) as scores_repophlan_f:
+        # header
+        line = scores_repophlan_f.readline()
+        line = line.strip().split('\t')
+        tax_id_idx = line.index('taxonomy')
+        for line in scores_repophlan_f:
+            line = line.strip().split('\t')
+            genome_id = line[0]
+            # only want tax_ids for genomes passing quality filter
+            if genome_id in genomes:
+                taxonomy = line[tax_id_idx]
+                # tax_id must be an integer, if not check the field
+                # immediately prior (bug in repophlan parsing)
+                if 'k__' not in taxonomy:
+                    taxonomy = line[tax_id_idx-1]
+                    if 'k__' not in taxonomy:
+                        genomes_without_taxonomy.append(genome_id)
+                        continue
+                genomes[genome_id].append(taxonomy)
+    return genomes, genomes_without_taxonomy
+
+
+def output_results(genomes, output_fp):
+    """Format FASTA labels in qualified genomes.
+
+    Parameters
+    ----------
+    genomes: dict
+        keys are RepoPhlAn genome IDs and values are list of taxonomy str and
+        FASTA filepath
+    output_fp: str
+        Output filepath        
+    """
+    with open(output_fp, 'w') as output_f:
+        for genome, info in genomes.items():
+            output_f.write('%s\t%s\t%s\n' % (genome, info[0], info[1]))
+
+
+@click.command()
+@click.option('--repophlan-scores-fp', required=True,
+              type=click.Path(resolve_path=True, readable=True, exists=True,
+                              file_okay=True),
+              help="RepoPhlAn's output report from repophlan_get_microbes.py")
+# Output of Jon's score average script
+# https://github.com/tanaes/script_bin/blob/master/filter_repophlan.py
+@click.option('--repophlan-scores-average-fp', required=True,
+              type=click.Path(resolve_path=True, readable=True, exists=True,
+                              file_okay=True),
+              help="RepoPhlAn's screened report")
+@click.option('--output-fp', required=True,
+              type=click.Path(resolve_path=True, readable=True, exists=False,
+                              file_okay=True),
+              help="Output filepath")
+def main(repophlan_scores_fp,
+         repophlan_scores_average_fp,
+         output_fp):
+    """Edit qualified genomes' labels to Kraken format.
+    """
+    genomes, genomes_without_taxonomy = get_genome_paths(
+        repophlan_scores_average_fp, repophlan_scores_fp)
+    if len(genomes_without_taxonomy) != 0:
+        raise ValueError(
+            "Some genomes do not have a taxid: %s" % genomes_without_taxonomy)
+
+    output_results(genomes, output_fp)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file