From 22bee5cca059b690e3b7309f4238c470c3a442d9 Mon Sep 17 00:00:00 2001 From: sgalkina Date: Mon, 13 Feb 2017 21:06:49 +0100 Subject: [PATCH] feat: match genes ids --- docker-compose.yml | 2 +- id_mapper/graph.py | 19 ++++++++--------- id_mapper/metanetx.py | 2 +- load_chem.py | 25 ++++++++++++++++++++++ load_gene.py | 49 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 85 insertions(+), 12 deletions(-) create mode 100644 load_chem.py create mode 100644 load_gene.py diff --git a/docker-compose.yml b/docker-compose.yml index ec9e4e4..39bf200 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,7 +4,7 @@ db: - 7474:7474 - 7687:7687 volumes: - - 'cleandb:/data' + - '/Users/svegal/neo4j_backup:/data' environment: - NEO4J_AUTH=neo4j/1 web: diff --git a/id_mapper/graph.py b/id_mapper/graph.py index 6182262..7ba5d77 100644 --- a/id_mapper/graph.py +++ b/id_mapper/graph.py @@ -20,23 +20,22 @@ def __str__(self): ) -def insert_pairs(graph, label, pair1, pair2): +def insert_pairs(graph, label, pair1, pair2, organism=None): """Merge nodes to database and create mutual IS relationships between :param graph: Graph :param label: "Metabolite" or "Reaction" :param pair1: node 1 :param pair2: node 2 + :param organism: str :return: """ - nodes = [ - Node( - label, - id=pair.metabolite, - db_name=pair.database - ) - for pair in (pair1, pair2) - ] + nodes = [] + for pair in (pair1, pair2): + kwargs = dict(id=pair.metabolite, db_name=pair.database) + if organism: + kwargs['organism'] = organism + nodes.append(Node(label, **kwargs)) for n in nodes: graph.merge(n) graph.merge(Is(nodes[0], nodes[1])) @@ -55,7 +54,7 @@ def find_match(graph, object_id, db_from, db_to): selector = NodeSelector(graph) result = [] found = False - for labels in ('Metabolite', 'Reaction'): + for labels in ('Metabolite', 'Reaction', 'Gene'): selected = list(selector.select(labels, id=object_id, db_name=db_from)) assert len(selected) <= 1 if selected: diff --git a/id_mapper/metanetx.py b/id_mapper/metanetx.py index 9c94094..2662319 100644 --- a/id_mapper/metanetx.py +++ b/id_mapper/metanetx.py @@ -11,7 +11,7 @@ def make_pairs(line): ready to be uploaded to the graph database """ xref, metanetx_id = line.split()[:2] - xref_db, xref_id = xref.split(':') + xref_db, xref_id = xref.split(':', maxsplit=1) return Pair(xref_id, xref_db), Pair(metanetx_id, 'mnx') diff --git a/load_chem.py b/load_chem.py new file mode 100644 index 0000000..8c4a9f7 --- /dev/null +++ b/load_chem.py @@ -0,0 +1,25 @@ +import os +from id_mapper.metanetx import make_pairs +from id_mapper.graph import insert_pairs +from py2neo import Graph + +from multiprocessing import Pool + +N_PROCESSES = 20 +N_LINES = 100 + +with open('chem_xref_mini_1.tsv') as f: + lines = list(f.readlines()) + + +def process_piece(chunk): + for line in chunk: + x, y = make_pairs(line) + if x.metabolite != y.metabolite: + insert_pairs(graph, 'Metabolite', x, y) + +graph = Graph(host=os.environ['DB_PORT_7687_TCP_ADDR'], password=os.environ['NEO4J_PASSWORD']) + +with Pool(processes=N_PROCESSES) as pool: + pool.map(process_piece, [lines[i:i+N_LINES] for i in range(0, len(lines), N_LINES)]) + diff --git a/load_gene.py b/load_gene.py new file mode 100644 index 0000000..d2b9649 --- /dev/null +++ b/load_gene.py @@ -0,0 +1,49 @@ +import os +from id_mapper.metanetx import make_pairs, Pair +from id_mapper.graph import insert_pairs +from py2neo import Graph +import re + +from multiprocessing import Pool + +N_PROCESSES = 20 +N_LINES = 100 + +with open('ecodata.txt') as f: + lines = list(f.readlines()) + +DATABASES = ['ecogene', 'eck', 'name', 'syn', 'genbank', 'sp', 'blattner', 'asap', 'genobase', 'cg'] + + +def process_piece(chunk): + for line in chunk: + info = dict(zip(DATABASES, line.split('\t'))) + to_delete = [] + for key, value in info.items(): + if value in ('None', 'Null', 'Null\n', 'null', 'null\n'): + to_delete.append(key) + else: + info[key] = info[key].strip("'; ").strip() + info[key] = re.sub('\(\w\.\w\.\)', '', info[key]) + for key in to_delete: + info.pop(key) + info['name'] = [info['name']] + if 'syn' in info: + info['name'].extend(info['syn'].split(', ')) + info.pop('syn') + info['name'] = [i.strip() for i in info['name']] + pair_1 = Pair(info['blattner'], 'blattner') + for key, value in info.items(): + if key != 'blattner': + if key != 'name': + insert_pairs(graph, 'Gene', pair_1, Pair(value, key), organism='ecoli') + else: + for n in value: + insert_pairs(graph, 'Gene', pair_1, Pair(n, key), organism='ecoli') + + +graph = Graph(host=os.environ['DB_PORT_7687_TCP_ADDR'], password=os.environ['NEO4J_PASSWORD']) + +with Pool(processes=N_PROCESSES) as pool: + pool.map(process_piece, [lines[i:i+N_LINES] for i in range(0, len(lines), N_LINES)]) +