-
Notifications
You must be signed in to change notification settings - Fork 0
/
mashscreen2json.py
107 lines (83 loc) · 3.36 KB
/
mashscreen2json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python3
"""
Purpose
-------
This module is intended to generate a json output for mash screen results that
can be imported in pATLAS.
Expected input
--------------
The following variables are expected whether using NextFlow or the
:py:func:`main` executor.
- ``mash_output`` : String with the name of the mash screen output file.
- e.g.: ``'sortedMashScreenResults_SampleA.txt'``
Code documentation
------------------
"""
__version__ = "1.0.1"
__build__ = "20022018"
__template__ = "mashscreen2json-nf"
from statistics import median
import os
import json
from assemblerflow_utils.assemblerflow_base import get_logger, MainWrapper
logger = get_logger(__file__)
if __file__.endswith(".command.sh"):
MASH_TXT = '$mashtxt'
logger.debug("Running {} with parameters:".format(
os.path.basename(__file__)))
logger.debug("MASH_TXT: {}".format(MASH_TXT))
@MainWrapper
def main(mash_output):
'''
converts top results from mash screen txt output to json format
Parameters
----------
mash_output: str
this is a string that stores the path to this file, i.e, the name of
the file
'''
logger.info("Reading file : {}".format(mash_output))
read_mash_output = open(mash_output)
dic = {}
median_list = []
filtered_dic = {}
logger.info("Generating dictionary and list to pre-process the final json")
for line in read_mash_output:
tab_split = line.split("\t")
identity = tab_split[0]
#shared_hashes = tab_split[1]
median_multiplicity = tab_split[2]
#p_value = tab_split[3]
query_id = tab_split[4]
#query-comment should not exist here and it is irrelevant
# here identity is what in fact interests to report to json but
# median_multiplicity also is important since it gives an rough
# estimation of the coverage depth for each plasmid.
# Plasmids should have higher coverage depth due to their increased
# copy number in relation to the chromosome.
dic[query_id] = [identity, median_multiplicity]
median_list.append(float(median_multiplicity))
output_json = open(" ".join(mash_output.split(".")[:-1]) + ".json", "w")
# median cutoff is twice the median of all median_multiplicity values
# reported by mash screen. In the case of plasmids, since the database
# has 9k entries and reads shouldn't have that many sequences it seems ok...
if len(median_list) > 0:
# this statement assures that median_list has indeed any entries
median_cutoff = median(median_list)
logger.info("Generating final json to dump to a file")
for k, v in dic.items():
# estimated copy number
copy_number = int(float(v[1]) / median_cutoff)
# assure that plasmid as at least twice the median coverage depth
if float(v[1]) > median_cutoff:
filtered_dic["_".join(k.split("_")[0:3])] = [v[0],
str(copy_number)]
logger.info(
"Exported dictionary has {} entries".format(len(filtered_dic)))
else:
# if no entries were found raise an error
logger.error("No matches were found using mash screen for the queried reads")
output_json.write(json.dumps(filtered_dic))
output_json.close()
if __name__ == "__main__":
main(MASH_TXT)