forked from cspost/dnatabase
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkn_mysql.py
197 lines (183 loc) · 7.63 KB
/
kn_mysql.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
"""
This sets up the MySQL database for the DNAtabase project. It inits the tables
for the database using our schema and then downloads and populates the gene
tables with information from Ensembl.
"""
import mysql.connector as sql
import urllib2
import shutil
import subprocess
import os
import gzip
HOST = 'knowice.cs.illinois.edu'
PORT = '3308'
USER = 'root'
PASS = 'KnowEnG'
DB = 'dnatabas_cs411'
EDB = "dnatabas_ensembl"
def get_conn(h=HOST, p=PORT, u=USER, ps=PASS, d=DB):
"""Returns a connection to the MySQL database"""
return sql.connect(host=h, port=p, user=u, password=ps, db=d)
def create_tables():
"""Creates the MySQL tables using our schema"""
conn = get_conn()
cursor = conn.cursor()
ut = ( "CREATE TABLE IF NOT EXISTS user ("
"firstname TEXT NOT NULL, "
"lastname TEXT NOT NULL, "
"email VARCHAR(50) NOT NULL, "
"institution TEXT NOT NULL, "
"PRIMARY KEY (email) );")
cursor.execute(ut)
conn.commit()
rt = ( "CREATE TABLE IF NOT EXISTS researches ("
"email VARCHAR(50) NOT NULL, "
"symbol VARCHAR(15) NOT NULL, "
"PRIMARY KEY (email, symbol),"
"FOREIGN KEY (email) REFERENCES user (email) "
"ON DELETE CASCADE ON UPDATE CASCADE "
");")
cursor.execute(rt)
conn.commit()
gst = ( "CREATE TABLE IF NOT EXISTS genesymbol ("
"symbol VARCHAR(15) NOT NULL, "
"name TEXT, "
"species VARCHAR(255) DEFAULT 'homo_sapiens', "
"url VARCHAR(255) DEFAULT 'www.genecards.org', "
"PRIMARY KEY (symbol) );")
cursor.execute(gst)
conn.commit()
gt = ( "CREATE TABLE IF NOT EXISTS gene ("
"gid VARCHAR(80) NOT NULL, "
"source VARCHAR(255) NOT NULL, "
"symbol VARCHAR(15) NOT NULL, "
"PRIMARY KEY (gid, source, symbol) );")
cursor.execute(gt)
conn.commit()
conn.close()
def download_ensembl():
"""Downloads the gene information data from ensembl"""
files = ['homo_sapiens_core_82_38.sql.gz', 'transcript.txt.gz',
'external_db.txt.gz', 'gene.txt.gz', 'object_xref.txt.gz',
'translation.txt.gz', 'xref.txt.gz']
url = "ftp://ftp.ensembl.org/pub/release-82/mysql/homo_sapiens_core_82_38/"
for fl in files:
print("Downloading: " + url + fl)
req = urllib2.urlopen(url + fl)
fl = '../data/mysql/' + fl
with open(fl, 'w') as out:
shutil.copyfileobj(req, out)
with gzip.open(fl) as f_in:
with open(fl[:-3], 'w') as f_out:
shutil.copyfileobj(f_in, f_out)
os.remove(fl)
def import_ensembl():
"""Imports the ensembl gene information"""
#Variables
import_flags = '--delete'
tablefile = '../data/mysql/*.txt'
sqlfile = "../data/mysql/homo_sapiens_core_82_38.sql"
#Import data into temporary database
cmd = [ 'mysql', '-u', USER, '-h', HOST, '--port', PORT,
'--password=' + PASS, EDB, '<', sqlfile]
subprocess.call(' '.join(cmd), shell=True)
cmd = [ 'mysqlimport', '-u', USER, '-h', HOST, '--port', PORT,
'--password=' + PASS, import_flags, '--fields_escaped_by=\\\\',
EDB, '-L', tablefile]
subprocess.call(' '.join(cmd), shell=True)
def combine_tables():
"""Combine ensembl tables and query the relevant info into a new one"""
steps = ['transcript', 'translation']
conn = get_conn(HOST, PORT, USER, PASS, EDB)
cursor = conn.cursor()
cmd = ("CREATE TABLE IF NOT EXISTS all_mappings " + get_insert_cmd('gene')
+ ";")
cursor.execute(cmd)
conn.commit()
for step in steps:
cmd = ("INSERT INTO all_mappings " + get_insert_cmd(step) + ";")
cursor.execute(cmd)
conn.commit()
def populate_gene_tables():
conn = get_conn(HOST, PORT, USER, PASS, EDB)
cursor = conn.cursor()
print("Creating genesymbol table")
cmd = ( "INSERT INTO dnatabas_cs411.genesymbol "
"SELECT DISTINCT all_mappings.display_label AS symbol, "
"SUBSTRING_INDEX(all_mappings.description, '[', 1) AS name, "
"'homo_sapiens' AS species, "
"CONCAT('http://www.genecards.org/cgi-bin/carddisp.pl?gene=', "
"all_mappings.display_label) AS url "
"FROM dnatabas_ensembl.all_mappings "
"WHERE all_mappings.db_display_name = 'HGNC Symbol'"
"GROUP BY symbol HAVING COUNT(symbol) = 1;")
cursor.execute(cmd)
conn.commit()
print("Creating stable2symbol table")
cmd = ( "CREATE TEMPORARY TABLE IF NOT EXISTS "
"dnatabas_ensembl.stable2symbol AS ("
"SELECT DISTINCT all_mappings.stable_id, "
"all_mappings.display_label "
"FROM dnatabas_ensembl.all_mappings "
"WHERE all_mappings.db_display_name = 'HGNC Symbol');")
cursor.execute(cmd)
conn.commit()
print("Creating gene table")
cmd = ( "INSERT INTO dnatabas_cs411.gene "
"SELECT DISTINCT all_mappings.dbprimary_acc as gid, "
"all_mappings.db_name as source, "
"stable2symbol.display_label AS symbol "
"FROM all_mappings, stable2symbol "
"WHERE all_mappings.stable_id = stable2symbol.stable_id;")
cursor.execute(cmd)
conn.commit()
def get_insert_cmd(step):
"""Returns the command to be used with an insert for the provided step."""
if step == 'gene':
cmd = ("SELECT DISTINCT xref.dbprimary_acc, xref.display_label, "
"external_db.db_name, external_db.db_display_name, "
"gene.description, gene.stable_id "
"FROM xref INNER JOIN external_db "
"ON xref.external_db_id = external_db.external_db_id "
"INNER JOIN object_xref "
"ON xref.xref_id = object_xref.xref_id "
"INNER JOIN gene "
"ON object_xref.ensembl_id = gene.gene_id "
"WHERE object_xref.ensembl_object_type = 'Gene'")
elif step == 'transcript':
cmd = ("SELECT DISTINCT xref.dbprimary_acc, xref.display_label, "
"external_db.db_name, external_db.db_display_name, "
"gene.description, gene.stable_id "
"FROM xref INNER JOIN external_db "
"ON xref.external_db_id = external_db.external_db_id "
"INNER JOIN object_xref "
"ON xref.xref_id = object_xref.xref_id "
"INNER JOIN transcript "
"ON object_xref.ensembl_id = transcript.transcript_id "
"INNER JOIN gene "
"ON transcript.gene_id = gene.gene_id "
"WHERE object_xref.ensembl_object_type = 'Transcript'")
elif step == 'translation':
cmd = ("SELECT DISTINCT xref.dbprimary_acc, xref.display_label, "
"external_db.db_name, external_db.db_display_name, "
"gene.description, gene.stable_id "
"FROM xref INNER JOIN external_db "
"ON xref.external_db_id = external_db.external_db_id "
"INNER JOIN object_xref "
"ON xref.xref_id = object_xref.xref_id "
"INNER JOIN translation "
"ON object_xref.ensembl_id = translation.translation_id "
"INNER JOIN transcript "
"ON translation.transcript_id = transcript.transcript_id "
"INNER JOIN gene "
"ON transcript.gene_id = gene.gene_id "
"WHERE object_xref.ensembl_object_type = 'Translation'")
else:
cmd = ''
return cmd
if __name__ == "__main__":
#create_tables()
#download_ensembl()
import_ensembl()
combine_tables()
populate_gene_tables()