-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsearch.py
91 lines (82 loc) · 3.5 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python
# -*- coding: utf-8 -*-
################################################################################
##Accession Search##
"""Reads through the result files and populates a dictionary of accession -> [samples] for accessions at or above the detection threshold."""
################################################################################
#Declarations
import os, re, csv
# Root directory handed to createFileList() by main(): the current working
# directory at import time.
my_dir = os.getcwd()
# Substring that a file name must contain to be picked up by createFileList().
term = "cen"
# Unused; readCensusresultFile() has its own default threshold of 10.
#thresh = 5
# Accumulator that createFileList() appends matching paths to and returns.
fileList = []
# Accumulator that main() passes to readCensusresultFile() for every file.
sampleDict = {}
#______________________________________________________________________________#
def createFileList(my_dir, term):
    """Collect the paths of all files under my_dir whose name contains term.

    The directory tree rooted at ``my_dir`` is walked with os.walk(); every
    file whose name contains ``term`` (matched literally, not as a regex)
    is joined with its directory and appended to the module-level
    ``fileList``. That same module-level list is returned, so repeated
    calls keep adding to it.
    """
    for root, _subdirs, names in os.walk(my_dir):
        fileList.extend(
            os.path.join(root, name)
            for name in names
            if re.search(re.escape(term), name)
        )
    return fileList
#______________________________________________________________________________#
def readCensusresultFile(file, dictionary, blacklist, threshold=10):
    """Record, per accession, the sample in which it reaches the threshold.

    The sample name is derived from the file name: the text after the last
    '_' and before the first following '.' (e.g. 'census_S1.csv' -> 'S1').

    Args:
        file: Path of a census result CSV. Column 1 holds the accession
            (anything after the first '.' is dropped) and column 2 an
            integer count. Rows whose first column is '+', '0' or 'id',
            and blank rows, are skipped.
        dictionary: Mapping of accession -> list of sample names. It is
            updated in place, so passing the same dict for several files
            accumulates the results.
        blacklist: Iterable of hashable accessions to ignore; compared
            against the accession after the '.suffix' has been removed.
        threshold: Minimum count (inclusive) for a row to be recorded.

    Returns:
        The same ``dictionary`` object that was passed in.

    Raises:
        IndexError: If a non-blank data row has fewer than three columns.
        ValueError: If the count column cannot be parsed as an integer.
    """
    import sys  # local import: only needed to choose the file mode below

    # Work on the base name so '_' or '.' in directory names cannot leak
    # into the sample name.
    sample = os.path.basename(file).split('_')[-1].split('.')[0]

    # Hash the blacklist once so the per-row membership test is O(1).
    ignored = frozenset(blacklist)

    # 'U' gives universal newlines on Python 2 but is rejected by
    # Python 3.11+, where text mode already translates newlines.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'

    with open(file, mode) as handle:
        for row in csv.reader(handle):
            if not row:  # blank line in the CSV
                continue
            if row[0] in ('+', '0', 'id'):
                continue
            accession = row[1].split('.')[0]
            if accession in ignored:
                continue
            if int(row[2]) < threshold:
                continue
            dictionary.setdefault(accession, []).append(sample)
    return dictionary
#______________________________________________________________________________#
def getGutDB(file):
    """Return the lines of the Gut DB accession file as a list of strings.

    Each line has its leading and trailing whitespace removed; blank lines
    are kept as empty strings.
    """
    with open(file) as handle:
        return [entry.strip() for entry in handle]
#______________________________________________________________________________#
def compare(dictionary, list):
    """Return the distinct dictionary members that do not appear in list.

    Every value of ``dictionary`` is iterated; each item found in those
    values that is absent from ``list`` is collected once, in the order it
    is first encountered.
    """
    unique = []
    for members in dictionary.values():
        for item in members:
            # Skip anything already known, either from the reference list
            # or from an earlier value of the dictionary.
            if item in list or item in unique:
                continue
            unique.append(item)
    return unique
#______________________________________________________________________________#
def main(gutDB=None):
    """Print one CSV line per accession listing the samples it was found in.

    Walks ``my_dir`` for files whose name contains ``term``, passes each one
    to readCensusresultFile() with the Gut DB accessions as the blacklist,
    and prints ``accession,sample1,sample2,...,`` for every entry
    accumulated in ``sampleDict``.

    Args:
        gutDB: Path of the Gut DB accession list (one accession per line).
            When omitted, the first command-line argument is used.
    """
    if gutDB is None:
        # No module-level `gutDB` path exists in this file, so the bare
        # name used to raise NameError; take the path from the command line.
        import sys
        if len(sys.argv) < 2:
            sys.exit('usage: search.py <gut_db_accession_file>')
        gutDB = sys.argv[1]

    paths = createFileList(my_dir, term)
    gutDBList = getGutDB(gutDB)
    for path in paths:
        readCensusresultFile(path, sampleDict, gutDBList)

    # readCensusresultFile() returns the dict it was given, so report from
    # sampleDict directly; this also prints nothing (instead of failing on
    # an unbound name) when no file matched.
    for accession in sampleDict:
        # The trailing comma is kept to preserve the existing output format.
        line = accession + ',' + ''.join(
            sample + ',' for sample in sampleDict[accession]
        )
        print(line)
# # for i in sampleDict.keys(): print len(sampleDict[i])
# print len(gutDBList), 'gut orgs'
# newOrgs = compare(sampleDict, gutDBList)
# print len(newOrgs)
# with open('newAcc.txt', 'a') as file:
# for i in newOrgs:
# print i,
# file.write(i+'\n')
#______________________________________________________________________________#
# Run the search only when executed as a script, not when imported.
if __name__ == '__main__':
    main()