Skip to content

Commit

Permalink
Support propagate counts up optional relationships, if specified by u…
Browse files Browse the repository at this point in the history
…ser.

#117
  • Loading branch information
dvklopfenstein committed Jul 26, 2019
1 parent 553948b commit 230fe33
Show file tree
Hide file tree
Showing 13 changed files with 146 additions and 63 deletions.
92 changes: 72 additions & 20 deletions goatools/anno/annoreader_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
from goatools.evidence_codes import EvidenceCodes
from goatools.anno.opts import AnnoOptions
from goatools.godag.consts import NAMESPACE2NS
from goatools.gosubdag.go_tasks import get_go2parents_go2obj

__copyright__ = "Copyright (C) 2016-2019, DV Klopfenstein, H Tang. All rights reserved."
__author__ = "DV Klopfenstein"


# pylint: disable=too-many-instance-attributes
class AnnoReaderBase(object):
"""Reads a Gene Association File. Returns a Python object."""
# pylint: disable=broad-except,line-too-long,too-many-instance-attributes
Expand All @@ -28,7 +30,6 @@ class AnnoReaderBase(object):

exp_nss = set(['BP', 'MF', 'CC'])

# pylint: disable=too-many-instance-attributes
def __init__(self, name, filename=None, **kws):
# kws: allow_missing_symbol
self.name = name # name is one of valid_formats
Expand Down Expand Up @@ -96,19 +97,6 @@ def get_id2gos_nss(self, **kws):
"""Return all associations in a dict, id2gos, regardless of namespace"""
return self._get_id2gos(self.associations, **kws)

#### def get_id2gos(self, namespace='BP', **kws):
#### """Return associations from specified namespace in a dict, id2gos"""
#### # pylint: disable=superfluous-parens
#### if self.has_ns():
#### assoc = [nt for nt in self.associations if nt.NS == namespace]
#### id2gos = self._get_id2gos(assoc, **kws)
#### print('{N} IDs in loaded association branch, {NS}'.format(N=len(id2gos), NS=namespace))
#### return id2gos
#### print('**ERROR get_id2gos: GODAG NOT LOADED. IGNORING namespace({NS})'.format(NS=namespace))
#### id2gos = self._get_id2gos(self.associations, **kws)
#### print('{N} IDs in association branch, {NS}'.format(N=len(id2gos), NS=namespace))
#### return id2gos

def get_id2gos(self, namespace=None, **kws):
"""Return associations from specified namespace in a dict, id2gos"""
# pylint: disable=superfluous-parens
Expand Down Expand Up @@ -150,17 +138,28 @@ def has_ns(self):
"""Return True if namespace field, NS exists on annotation namedtuples"""
return hasattr(next(iter(self.associations)), 'NS')

def _get_id2gos(self, associations, **kws):
"""Return given associations in a dict, id2gos"""
def _get_id2gos(self, ntannos_usr, propagate_counts=False, relationships=None, prt=sys.stdout, **kws):
"""Return given ntannos_usr in a dict, id2gos"""
options = AnnoOptions(self.evobj, **kws)
# Default reduction is to remove. For all options, see goatools/anno/opts.py:
# * Evidence_Code == ND -> No biological data No biological Data available
# * Qualifiers contain NOT
assc = self.reduce_annotations(associations, options)
a2bs = self.get_dbid2goids(assc) if options.b_geneid2gos else self.get_goid2dbids(assc)
ntannos_m = self.reduce_annotations(ntannos_usr, options)
dbid2goids = self.get_dbid2goids(ntannos_m, propagate_counts, relationships, prt)
if options.b_geneid2gos:
return dbid2goids
# if not a2bs:
# raise RuntimeError('**ERROR: NO ASSOCATIONS FOUND: {FILE}'.format(FILE=self.filename))
return a2bs
return self._get_goid2dbids(dbid2goids)

@staticmethod
def _get_goid2dbids(dbid2goids):
"""Return dict of GO ID keys and a set of gene products as values"""
goid2dbids = cx.defaultdict(set)
for dbid, goids in dbid2goids.items():
for goid in goids:
goid2dbids[goid].add(dbid)
return dict(goid2dbids)

def _get_namespaces(self, nts):
"""Get the set of namespaces seen in the namedtuples."""
Expand Down Expand Up @@ -197,13 +196,66 @@ def reduce_annotations(self, annotations, options):
return [nt for nt in annotations if getfnc_qual_ev(nt.Qualifier, nt.Evidence_Code)]

@staticmethod
def get_dbid2goids(associations):
def update_association(assc_goidsets, go2ancestors, prt=sys.stdout):
"""Update the GO sets in assc_gene2gos to include all GO ancestors"""
goids_avail = set(go2ancestors)
# assc_gos is assc_gene2gos.values()
for assc_goids_cur in assc_goidsets:
parents = set()
for goid in assc_goids_cur.intersection(goids_avail):
parents.update(go2ancestors[goid])
assc_goids_cur.update(parents)

def _get_go2ancestors(self, goids_assoc_usr, relationships, prt=sys.stdout):
"""Return go2ancestors (set of parent GO IDs) for all GO ID keys in go2obj."""
assert self.godag is not None
_godag = self.godag
# Get GO IDs in annotations that are in GO DAG
goids_avail = set(_godag)
self._rpt_goids_notfound(goids_assoc_usr, goids_avail)
goids_assoc_cur = goids_assoc_usr.intersection(goids_avail)
# Get GO Term for each current GO ID in the annotations
_go2obj_assc = {go:_godag[go] for go in goids_assoc_cur}
go2ancestors = get_go2parents_go2obj(_go2obj_assc, relationships, prt)
if prt:
prt.write('{N} GO IDs -> {M} go2ancestors\n'.format(
N=len(goids_avail), M=len(go2ancestors)))
return go2ancestors

@staticmethod
def _rpt_goids_notfound(goids_assoc_all, goids_avail):
"""Report the number of GO IDs in the association, but not in the GODAG"""
goids_missing = goids_assoc_all.difference(goids_avail)
if goids_missing:
print("{N} GO IDs NOT FOUND IN ASSOCIATION: {GOs}".format(
N=len(goids_missing), GOs=" ".join(goids_missing)))

def get_dbid2goids(self, ntannos, propagate_counts=False, relationships=None, prt=sys.stdout):
"""Return gene2go data for user-specified taxids."""
if propagate_counts:
return self._get_dbid2goids_p1(ntannos, relationships, prt)
return self._get_dbid2goids_p0(ntannos)

@staticmethod
def _get_dbid2goids_p0(associations):
"""Return gene2goids with annotations as-is (propagate_counts == False)"""
id2gos = cx.defaultdict(set)
for ntd in associations:
id2gos[ntd.DB_ID].add(ntd.GO_ID)
return dict(id2gos)

def _get_dbid2goids_p1(self, ntannos, relationships=None, prt=sys.stdout):
"""Return gene2goids with propagate_counts == True"""
id2gos = cx.defaultdict(set)
goids_annos = set(nt.GO_ID for nt in ntannos)
go2ancestors = self._get_go2ancestors(goids_annos, relationships, prt)
for ntd in ntannos:
goid = ntd.GO_ID
goids = id2gos[ntd.DB_ID]
goids.add(goid)
goids.update(go2ancestors[goid])
return dict(id2gos)

@staticmethod
def get_goid2dbids(associations):
"""Return gene2go data for user-specified taxids."""
Expand Down
48 changes: 22 additions & 26 deletions goatools/cli/find_enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from goatools.rpt.prtfmt import PrtFmt
from goatools.semantic import TermCounts
from goatools.wr_tbl import prt_tsv_sections
## from goatools.godag.consts import RELATIONSHIP_SET
from goatools.godag.consts import RELATIONSHIP_SET
from goatools.godag.consts import chk_relationships
from goatools.godag.prtfncs import GoeaPrintFunctions
from goatools.anno.factory import get_anno_desc
Expand All @@ -57,9 +57,6 @@ class GoeaCliArgs(object):

def __init__(self):
self.args = self._init_args()
#### print('BBBBBBBBBBBBBB', self.args)
#### print('BBBBBBBBBBBBBB', self.args.relationship)
#### print('BBBBBBBBBBBBBB', self.args.relationships)

def _init_args(self):
"""Get enrichment arg parser."""
Expand Down Expand Up @@ -196,6 +193,8 @@ def _adjust_relationships(args):
# --relationships=part_of -> relationships=['part_of']
# --relationships=part_of,regulates -> relationships=['part_of,regulates']
# --relationships=part_of regulates -> NOT VALID
if args.relationship:
args.relationships = RELATIONSHIP_SET
if args.relationships is not None:
if len(args.relationships) == 1 and ',' in args.relationships[0]:
args.relationships = args.relationships[0].split(',')
Expand All @@ -215,30 +214,21 @@ def __init__(self, args):
godag_optional_attrs = self._get_optional_attrs()
self.godag = GODag(obo_file=self.args.obo, optional_attrs=godag_optional_attrs)
# print('ARGS GoeaCliFnc ', self.args)
# GET Gene2GoReader, GafReader, GpadReader, or IdToGosReader
# GET: Gene2GoReader, GafReader, GpadReader, or IdToGosReader
self.objanno = self._get_objanno(self.args.filenames[2])
_ns2assoc = self.objanno.get_ns2assc(**self._get_anno_kws())
_study, _pop = self.rd_files(*self.args.filenames[:2])
if not self.args.compare: # sanity check
if not self.args.compare:
# Compare population and study gene product sets
self.chk_genes(_study, _pop, self.objanno.associations)
self.methods = self.args.method.split(",")
self.itemid2name = self._init_itemid2name()
# Get GOEnrichmentStudyNS
self.objgoeans = self._init_objgoeans(_pop, _ns2assoc)
self.objgoeans = self._init_objgoeans(_pop)
# Run GOEA
self.results_all = self.objgoeans.run_study(_study)
# Prepare for grouping, if user-specified. Create GroupItems
self.prepgrp = GroupItems(self, self.godag.version) if self.sections else None

def _get_anno_kws(self):
"""Return keyword options to obtain id2gos"""
kws = {}
if self.args.ev_inc is not None:
kws['ev_include'] = set(self.args.ev_inc.split(','))
if self.args.ev_exc is not None:
kws['ev_exclude'] = set(self.args.ev_exc.split(','))
return kws

def _get_objanno(self, assoc_fn):
"""Get an annotation object"""
# Determine annotation file format from filename, if possible
Expand All @@ -260,11 +250,9 @@ def _get_ns(self):

def _get_kws_objanno(self, anno_type):
"""Get keyword-args for creating an Annotation object"""
kws = {'namespaces': self._get_ns()}
kws = {'namespaces': self._get_ns(), 'godag': self.godag}
if anno_type == 'gene2go':
kws['taxid'] = self.args.taxid
if anno_type in {'gpad', 'id2gos'}:
kws['godag'] = self.godag
return kws

def _init_itemid2name(self):
Expand Down Expand Up @@ -325,26 +313,34 @@ def get_results(self):
"""Given all GOEA results, return the significant results (< pval)."""
return self.get_results_sig() if self.args.pval != -1.0 else self.results_all

def _init_objgoeans(self, pop, ns2assoc):
def _init_objgoeans(self, pop):
"""Run gene ontology enrichment analysis (GOEA)."""
propagate_counts = not self.args.no_propagate_counts
ns2assoc = self.objanno.get_ns2assc(**self._get_anno_kws())
return GOEnrichmentStudyNS(pop, ns2assoc, self.godag,
propagate_counts=propagate_counts,
relationships=False,
propagate_counts=not self.args.no_propagate_counts,
relationships=self.args.relationships,
alpha=self.args.alpha,
pvalcalc=self.args.pvalcalc,
methods=self.methods)
def _get_anno_kws(self):
"""Return keyword options to obtain id2gos"""
kws = {}
if self.args.ev_inc is not None:
kws['ev_include'] = set(self.args.ev_inc.split(','))
if self.args.ev_exc is not None:
kws['ev_exclude'] = set(self.args.ev_exc.split(','))
return kws

def chk_genes(self, study, pop, ntsassoc=None):
"""Check gene sets."""
"""Compare population and study gene product sets"""
if len(pop) < len(study):
exit("\nERROR: The study file contains more elements than the population file. "
"Please check that the study file is a subset of the population file.\n")
# check the fraction of genomic ids that overlap between study and population
overlap = self.get_overlap(study, pop)
if overlap < 0.95:
sys.stderr.write("\nWARNING: only {} fraction of genes/proteins in study are found in "
"the population background.\n\n".format(overlap))
"the population background.\n\n".format(overlap))
if overlap <= self.args.min_overlap:
exit("\nERROR: only {} of genes/proteins in the study are found in the "
"background population. Please check.\n".format(overlap))
Expand Down
2 changes: 1 addition & 1 deletion makefile
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ pytest:
#py.test -v tests/
#make test_scripts
python3 -m pytest -v tests | tee pytest.log
make chk_parsers
# make chk_parsers
# py.test tests/ --log-file=pytest.log
grep FAIL pytest.log

Expand Down
Binary file modified tests/data/i126/viral_r1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/data/i126/viral_r_partof.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/data/i126/viral_r_regp.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/data/i126/viral_r_regrn.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/data/i126/viral_r_regrp.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/data/i126/viral_r_regrpn.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 5 additions & 1 deletion tests/test_anno_allfmts.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
INC_GOOD = set(EvidenceCodes.code2nt.keys()).difference({'IEA'})


# pylint: disable=superfluous-parens
def test_anno_read():
"""Test all annotation formats"""
godag = get_godag(os.path.join(REPO, 'go-basic.obo'))
Expand Down Expand Up @@ -103,6 +104,7 @@ def _run_get_id2gos2(annoobjs):
id2gos = obj.get_id2gos('all')
assert id2gos, 'NO ANNOTATIONS FOUND'
assert id2gos == obj.get_id2gos_nss()
# pylint: disable=line-too-long
assert obj.get_id2gos('all', ev_include={'IEA'}) == obj.get_id2gos_nss(ev_include={'IEA'})
idx += 1

Expand Down Expand Up @@ -219,13 +221,15 @@ def _chk_namespaces(obj, namespaces):
assert not hasattr(next(iter(obj.associations)), 'NS')
return
for nta in obj.associations:
assert nta.NS in namespaces
assert nta.NS in namespaces, 'nta.NS({NS}) not in ({NSs})\n{NT}'.format(
NSs=namespaces, NS=nta.NS, NT=nta)

def _get_num_annos(id2gos):
"""Return the number of annotations found in id2gos"""
num = 0
for gos in id2gos.values():
num += len(gos)
return num

if __name__ == '__main__':
test_anno_read()
2 changes: 2 additions & 0 deletions tests/test_cmds_find_enrichment_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def _get_cmds():
'python3 scripts/find_enrichment.py data/study data/population data/association --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_id2gos.xlsx',
'python3 scripts/find_enrichment.py ids_stu_gaf.txt ids_pop_gaf.txt goa_human.gaf --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gaf.xlsx',
'python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx',
'python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx -r',
'python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx --relationships=regulates,negatively_regulates,positively_regulates',
'python3 scripts/find_enrichment.py ids_stu_gene2go_9606.txt ids_pop_gene2go_9606.txt gene2go --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gene2go_9606.xlsx',
'python3 scripts/find_enrichment.py ids_stu_gene2go_10090.txt ids_pop_gene2go_10090.txt gene2go --taxid=10090 --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gene2go_10090.xlsx',
'python3 scripts/find_enrichment.py ids_stu_gpad.txt ids_pop_gpad.txt goa_human.gpad --ev_exc=IEA --pval=0.05 --method=fdr_bh --pval_field=fdr_bh --outfile=results_gpad.xlsx',
Expand Down
26 changes: 20 additions & 6 deletions tests/test_propagate_counts_w_relationships.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,30 @@ def test_pc_w_rels(prt=sys.stdout):
file_obo = os.path.join(REPO, "go-basic.obo")
godag_r0 = get_godag(file_obo, prt, loading_bar=None)
godag_r1 = get_godag(file_obo, prt, loading_bar=None, optional_attrs=['relationship'])

# pylint: disable=line-too-long
# Check that relationships are used in propgate counts
results_r0 = _get_results(godag_r1, propagate_counts=True, relationships=False, prt=prt)
results_r1 = _get_results(godag_r1, propagate_counts=True, relationships=True, prt=prt)
_chk_results(results_r0, results_r1, prt)

def _chk_results(results_r0, results_r1, prt):
"""Test propagate_counts up relationships as well as parent-child links."""
prt.write('TBD: Compare results\n')
results_rr = _get_results(godag_r1, propagate_counts=True, relationships={'regulates', 'negatively_regulates', 'positively_regulates'}, prt=prt)
results_rp = _get_results(godag_r1, propagate_counts=True, relationships={'part_of'}, prt=prt)
prt.write('{N} results with r0\n'.format(N=len(results_r0)))
prt.write('{N} results with r1\n'.format(N=len(results_r1)))
pass
prt.write('{N} results with all regulates\n'.format(N=len(results_rr)))
prt.write('{N} results with part_of\n'.format(N=len(results_rp)))
assert len(results_r1) > len(results_rr)
assert len(results_rr) > len(results_rp)
assert len(results_rp) > len(results_r0)

# TBD: Add warning message that relationships are ignored
# Check that relationships are ignored in propagate counts if they were not loaded
_get_results(godag_r0, propagate_counts=True, relationships=False, prt=prt)
## results_r0b = _get_results(godag_r0, propagate_counts=True, relationships=True, prt=prt)
## results_r0c = _get_results(godag_r0, propagate_counts=True, relationships={'regulates', 'negatively_regulates', 'positively_regulates'}, prt=prt)
## results_r0d = _get_results(godag_r0, propagate_counts=True, relationships={'part_of'}, prt=prt)
## assert len(results_r0a) == len(results_r0b)
## assert len(results_r0b) == len(results_r0c)
## assert len(results_r0c) == len(results_r0d)

def _get_results(godag, propagate_counts, relationships, prt=sys.stdout):
"""Run a GOEA. Return results"""
Expand Down
Loading

0 comments on commit 230fe33

Please sign in to comment.