Skip to content

Commit

Permalink
feat: match genes ids
Browse files Browse the repository at this point in the history
  • Loading branch information
sgalkina committed Feb 13, 2017
1 parent ed7adc3 commit 22bee5c
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 12 deletions.
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ db:
- 7474:7474
- 7687:7687
volumes:
- 'cleandb:/data'
- '/Users/svegal/neo4j_backup:/data'
environment:
- NEO4J_AUTH=neo4j/1
web:
Expand Down
19 changes: 9 additions & 10 deletions id_mapper/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,22 @@ def __str__(self):
)


def insert_pairs(graph, label, pair1, pair2):
def insert_pairs(graph, label, pair1, pair2, organism=None):
"""Merge nodes to database and create mutual IS relationships between
:param graph: Graph
:param label: "Metabolite" or "Reaction"
:param pair1: node 1
:param pair2: node 2
:param organism: str
:return:
"""
nodes = [
Node(
label,
id=pair.metabolite,
db_name=pair.database
)
for pair in (pair1, pair2)
]
nodes = []
for pair in (pair1, pair2):
kwargs = dict(id=pair.metabolite, db_name=pair.database)
if organism:
kwargs['organism'] = organism
nodes.append(Node(label, **kwargs))
for n in nodes:
graph.merge(n)
graph.merge(Is(nodes[0], nodes[1]))
Expand All @@ -55,7 +54,7 @@ def find_match(graph, object_id, db_from, db_to):
selector = NodeSelector(graph)
result = []
found = False
for labels in ('Metabolite', 'Reaction'):
for labels in ('Metabolite', 'Reaction', 'Gene'):
selected = list(selector.select(labels, id=object_id, db_name=db_from))
assert len(selected) <= 1
if selected:
Expand Down
2 changes: 1 addition & 1 deletion id_mapper/metanetx.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def make_pairs(line):
ready to be uploaded to the graph database
"""
xref, metanetx_id = line.split()[:2]
xref_db, xref_id = xref.split(':')
xref_db, xref_id = xref.split(':', maxsplit=1)
return Pair(xref_id, xref_db), Pair(metanetx_id, 'mnx')


Expand Down
25 changes: 25 additions & 0 deletions load_chem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os
from id_mapper.metanetx import make_pairs
from id_mapper.graph import insert_pairs
from py2neo import Graph

from multiprocessing import Pool

# Number of input lines handed to a worker in one task.
N_LINES = 100
# Number of worker processes in the pool.
N_PROCESSES = 20

# Read the whole cross-reference file into memory once; the list is sliced
# into N_LINES-sized chunks when the work is distributed.
with open('chem_xref_mini_1.tsv') as xref_file:
    lines = xref_file.readlines()


def process_piece(chunk):
    """Insert the cross-references found in a chunk of lines into the graph.

    Every data line is converted by ``make_pairs`` into two pairs; unless
    both pairs carry the same identifier they are merged into the
    module-level ``graph`` as 'Metabolite' nodes through ``insert_pairs``.

    Blank lines and lines starting with '#' are skipped: they carry no
    ``db:id`` cross-reference, so ``make_pairs`` cannot unpack them and a
    single such line would otherwise abort the whole worker pool.

    :param chunk: iterable of str, raw lines of the cross-reference file
    :return: None
    """
    for line in chunk:
        entry = line.strip()
        if not entry or entry.startswith('#'):
            continue
        x, y = make_pairs(line)
        # A reference that maps an identifier onto itself adds no information.
        if x.metabolite != y.metabolite:
            insert_pairs(graph, 'Metabolite', x, y)

# process_piece() reads ``graph`` as a global, so the name has to be defined
# at module level (and not under the main guard) to exist in every worker.
graph = Graph(host=os.environ['DB_PORT_7687_TCP_ADDR'], password=os.environ['NEO4J_PASSWORD'])

# Worker processes may re-import this module (spawn start method); without
# the guard each of them would try to start a pool of its own.
if __name__ == '__main__':
    chunks = [lines[i:i + N_LINES] for i in range(0, len(lines), N_LINES)]
    with Pool(processes=N_PROCESSES) as pool:
        pool.map(process_piece, chunks)

49 changes: 49 additions & 0 deletions load_gene.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import os
from id_mapper.metanetx import make_pairs, Pair
from id_mapper.graph import insert_pairs
from py2neo import Graph
import re

from multiprocessing import Pool

# Number of input lines handed to a worker in one task.
N_LINES = 100
# Number of worker processes in the pool.
N_PROCESSES = 20

# Read the whole gene table into memory once; the list is sliced into
# N_LINES-sized chunks when the work is distributed.
with open('ecodata.txt') as gene_file:
    lines = gene_file.readlines()

# Column names in file order: they are zipped against the tab-separated
# fields of every line and used as the database name of each identifier.
DATABASES = [
    'ecogene',
    'eck',
    'name',
    'syn',
    'genbank',
    'sp',
    'blattner',
    'asap',
    'genobase',
    'cg',
]


# Column values (after trimming whitespace) that mean "no identifier".
_NULL_VALUES = frozenset(('None', 'Null', 'null'))

# Parenthesised tag of the form "(x.y.)" that is removed from identifiers.
_TAG_RE = re.compile(r'\(\w\.\w\.\)')


def _parse_gene_line(line):
    """Map database name -> cleaned identifier for one tab-separated line.

    Columns are matched by position against ``DATABASES``. Null markers and
    values that are empty after cleaning are left out of the result.
    """
    record = {}
    for db_name, raw in zip(DATABASES, line.split('\t')):
        value = raw.strip()
        if value in _NULL_VALUES:
            continue
        # Whitespace is trimmed first so that the newline ending the last
        # column cannot shield quote/semicolon characters from strip().
        value = _TAG_RE.sub('', value.strip("'; ").strip())
        if value:
            record[db_name] = value
    return record


def process_piece(chunk):
    """Link every gene identifier in a chunk of lines to its blattner id.

    For each line the blattner identifier becomes the anchor pair, and every
    other identifier on the line is merged into the module-level ``graph``
    as a 'Gene' pair with it (organism 'ecoli') through ``insert_pairs``.
    Synonyms (column 'syn', separated by ', ') are stored under the 'name'
    database together with the primary name.

    Blank lines and lines without a blattner identifier are skipped, and
    empty identifiers are never inserted.

    :param chunk: iterable of str, raw tab-separated lines
    :return: None
    """
    for line in chunk:
        if not line.strip():
            continue
        record = _parse_gene_line(line)
        blattner_id = record.pop('blattner', None)
        if blattner_id is None:
            # Nothing to anchor the other identifiers to.
            continue
        names = []
        if 'name' in record:
            names.append(record['name'])
        if 'syn' in record:
            names.extend(record.pop('syn').split(', '))
        if names:
            record['name'] = [name.strip() for name in names]
        anchor = Pair(blattner_id, 'blattner')
        for db_name, value in record.items():
            identifiers = value if db_name == 'name' else [value]
            for identifier in identifiers:
                if not identifier:
                    # An empty id would be merged into one node shared by
                    # unrelated genes.
                    continue
                insert_pairs(
                    graph, 'Gene', anchor, Pair(identifier, db_name),
                    organism='ecoli',
                )


# process_piece() reads ``graph`` as a global, so the name has to be defined
# at module level (and not under the main guard) to exist in every worker.
graph = Graph(host=os.environ['DB_PORT_7687_TCP_ADDR'], password=os.environ['NEO4J_PASSWORD'])

# Worker processes may re-import this module (spawn start method); without
# the guard each of them would try to start a pool of its own.
if __name__ == '__main__':
    chunks = [lines[i:i + N_LINES] for i in range(0, len(lines), N_LINES)]
    with Pool(processes=N_PROCESSES) as pool:
        pool.map(process_piece, chunks)

0 comments on commit 22bee5c

Please sign in to comment.