Skip to content

Commit

Permalink
Merge pull request #39 from NAL-i5K/issue656
Browse files Browse the repository at this point in the history
issue656
  • Loading branch information
mpoelchau authored Jun 15, 2018
2 parents a8eedcf + e7c7b36 commit 78cb8ed
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 40 deletions.
66 changes: 38 additions & 28 deletions gff3tool/bin/gff3_to_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,18 @@
def complement(seq):
return seq.translate(COMPLEMENT_TRANS)

def get_subseq(gff, line):
def get_subseq(gff, line, embedded_fasta=False):
# it would give positive strand out as default, unless the strand information is given as '-'
try:
start = line['start']-1
except:
pass
end = line['end']
try:
string = gff.fasta_external[line['seqid']]['seq'][start:end]
if gff.fasta_external and not embedded_fasta:
string = gff.fasta_external[line['seqid']]['seq'][start:end]
else:
string = gff.fasta_embedded[line['seqid']]['seq'][start:end]
except:
if line['type'] != "CDS":
print('WARNING [SeqID/Start/End] Missing SeqID, or Start/End is not a valid integer.\n\t\t- Line {0:s}: {1:s}'.format(str(line['line_index']+1), line['line_raw']))
Expand Down Expand Up @@ -58,7 +61,7 @@ def translator(seq):
peptide += amino_acid
return peptide

def splicer(gff, ftype, dline, stype):
def splicer(gff, ftype, dline, stype, embedded_fasta=False):
seq=dict()
segments_Set = set()
sort_seg = []
Expand Down Expand Up @@ -142,7 +145,7 @@ def splicer(gff, ftype, dline, stype):
s['start'] = start
s['end'] = end
s['phase'] = 0
tmpseq = tmpseq + get_subseq(gff, s)
tmpseq = tmpseq + get_subseq(gff, s, embedded_fasta)
count += 1
seq[defline] = tmpseq
if len(segments_Set) != 0:
Expand Down Expand Up @@ -257,7 +260,7 @@ def splicer(gff, ftype, dline, stype):
s['start'] = start
s['end'] = end
s['phase'] = 0
tmpseq = tmpseq + get_subseq(gff, s)
tmpseq = tmpseq + get_subseq(gff, s, embedded_fasta)
count += 1

seq[defline] = tmpseq
Expand Down Expand Up @@ -285,7 +288,7 @@ def splicer(gff, ftype, dline, stype):

return seq

def extract_start_end(gff, stype, dline):
def extract_start_end(gff, stype, dline, embedded_fasta=False):
'''Extract sequences for a feature only use the Start and End information. The relationship between parent and children would be ignored.'''
seq=dict()
roots = []
Expand Down Expand Up @@ -318,7 +321,7 @@ def extract_start_end(gff, stype, dline):
defline = '>{0:s}:{1:d}..{2:d}:{3:s}|genomic_sequence({4:s})|Parent={5:s}|ID={6:s}|Name={7:s}'.format(child['seqid'], child['start'], child['end'], child['strand'], child['type'], rid, cid, cname)
except:
pass
seq[defline] = get_subseq(gff, child)
seq[defline] = get_subseq(gff, child, embedded_fasta)
elif stype == 'gene':
for root in roots:
if root['type'] == 'gene' or root['type'] == 'pseudogene':
Expand All @@ -331,7 +334,7 @@ def extract_start_end(gff, stype, dline):
defline='>{0:s}'.format(rid)
if dline == 'complete':
defline = '>{0:s}:{1:d}..{2:d}:{3:s}|{6:s}|ID={4:s}|Name={5:s}'.format(root['seqid'], root['start'], root['end'], root['strand'], rid, rname, root['type'])
seq[defline] = get_subseq(gff, root)
seq[defline] = get_subseq(gff, root, embedded_fasta)
elif root['type'] == "":
print('WARNING [Missing feature type] Program failed.\n\t\t- Line {0:s}: {1:s}'.format(str(root['line_index']+1), root['line_raw']))
elif stype == 'exon':
Expand All @@ -357,7 +360,7 @@ def extract_start_end(gff, stype, dline):
if dline == 'complete':
defline = '>{0:s}:{1:d}..{2:d}:{3:s}|{4:s}|Parent={5:s}|ID={6:s}|Name={7:s}'.format(exon['seqid'], exon['start'], exon['end'], exon['strand'], exon['type'], pid, eid, ename)

seq[defline] = get_subseq(gff, exon)
seq[defline] = get_subseq(gff, exon, embedded_fasta)
except:
print('WARNING [Missing Attributes] Program failed.\n\t\t- Line {0:s}: {1:s}'.format(str(exon['line_index']+1), exon['line_raw']))
else:
Expand All @@ -384,20 +387,20 @@ def extract_start_end(gff, stype, dline):
defline = '>{0:s}:{1:d}..{2:d}:{3:s}|{4:s}|Parent={5:s}|ID={6:s}|Name={7:s}'.format(user_defined['seqid'], user_defined['start'], user_defined['end'], user_defined['strand'], user_defined['type'], pid, uid, uname)
else:
defline = '>{0:s}:{1:d}..{2:d}:{3:s}|{4:s}|ID={5:s}|Name={6:s}'.format(user_defined['seqid'], user_defined['start'], user_defined['end'], user_defined['strand'], user_defined['type'], uid, uname)
seq[defline] = get_subseq(gff, user_defined)
seq[defline] = get_subseq(gff, user_defined, embedded_fasta)
except:
print('WARNING [Missing Attributes] Program failed.\n\t\t- Line {0:s}: {1:s}'.format(str(user_defined['line_index']+1), user_defined['line_raw']))

return seq

def main(gff_file=None, fasta_file=None, stype=None, user_defined=None, dline=None, qc=True, output_prefix=None, logger=None):
def main(gff_file=None, fasta_file=None, embedded_fasta=False, stype=None, user_defined=None, dline=None, qc=True, output_prefix=None, logger=None):
stderr_handler = logging.StreamHandler()
stderr_handler.setFormatter(logging.Formatter('%(levelname)-8s %(message)s'))
logger_null = logging.getLogger(__name__+'null')
null_handler = logging.NullHandler()
logger_null.addHandler(null_handler)

if not gff_file or not fasta_file or not stype:
if not gff_file or (not fasta_file and not embedded_fasta) or not stype:
print('Gff file, fasta file, and type of extracted sequences need to be specified')
sys.exit(1)
type_set=['gene','exon','pre_trans', 'trans', 'cds', 'pep', 'all', 'user_defined']
Expand Down Expand Up @@ -430,6 +433,9 @@ def main(gff_file=None, fasta_file=None, stype=None, user_defined=None, dline=No
if qc:
initial_phase = False
gff = Gff3(gff_file=gff_file, fasta_external=fasta_file, logger=logger)
if embedded_fasta and len(gff.fasta_embedded) == 0:
logger.error('There is no embedded fasta in the GFF3 file.')
sys.exit(1)
logger.info('Checking errors...')
gff.check_parent_boundary()
gff.check_phase(initial_phase)
Expand All @@ -442,7 +448,7 @@ def main(gff_file=None, fasta_file=None, stype=None, user_defined=None, dline=No
if t:
error_set.extend(t)

if len(error_set):
if error_set and len(error_set):
escaped_error = ['Esf0012','Esf0033']
eSet = list()
for e in error_set:
Expand All @@ -456,6 +462,8 @@ def main(gff_file=None, fasta_file=None, stype=None, user_defined=None, dline=No
print e['ID'], e['eCode'], tag
else:
gff = Gff3(gff_file=gff_file, fasta_external=fasta_file, logger=logger_null)
if embedded_fasta and len(gff.fasta_embedded) == 0:
logger.error('There is no embedded fasta in the GFF3 file.')

logger.info('Extract sequences for {0:s}...'.format(stype))
seq=dict()
Expand All @@ -469,7 +477,7 @@ def main(gff_file=None, fasta_file=None, stype=None, user_defined=None, dline=No

tmp_stype = 'pre_trans'
logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
seq = extract_start_end(gff, tmp_stype, dline)
seq = extract_start_end(gff, tmp_stype, dline, embedded_fasta)
if len(seq):
fname = '{0:s}_{1:s}.fa'.format(output_prefix, tmp_stype)
report_fh = open(fname, 'wb')
Expand All @@ -481,7 +489,7 @@ def main(gff_file=None, fasta_file=None, stype=None, user_defined=None, dline=No
seq=dict()
tmp_stype = 'gene'
logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
seq = extract_start_end(gff, tmp_stype, dline)
seq = extract_start_end(gff, tmp_stype, dline, embedded_fasta)
if len(seq):
fname = '{0:s}_{1:s}.fa'.format(output_prefix, tmp_stype)
report_fh = open(fname, 'wb')
Expand All @@ -493,7 +501,7 @@ def main(gff_file=None, fasta_file=None, stype=None, user_defined=None, dline=No
seq=dict()
tmp_stype = 'exon'
logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
seq = extract_start_end(gff, tmp_stype, dline)
seq = extract_start_end(gff, tmp_stype, dline, embedded_fasta)
if len(seq):
fname = '{0:s}_{1:s}.fa'.format(output_prefix, tmp_stype)
report_fh = open(fname, 'wb')
Expand All @@ -506,7 +514,7 @@ def main(gff_file=None, fasta_file=None, stype=None, user_defined=None, dline=No
tmp_stype = 'trans'
feature_type = ['exon', 'pseudogenic_exon']
logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
seq = splicer(gff, feature_type, dline, stype)
seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
if len(seq):
fname = '{0:s}_{1:s}.fa'.format(output_prefix, tmp_stype)
report_fh = open(fname, 'wb')
Expand All @@ -519,7 +527,7 @@ def main(gff_file=None, fasta_file=None, stype=None, user_defined=None, dline=No
tmp_stype = 'cds'
feature_type = ['CDS']
logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
seq = splicer(gff, feature_type, dline, stype)
seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
if len(seq):
fname = '{0:s}_{1:s}.fa'.format(output_prefix, tmp_stype)
report_fh = open(fname, 'wb')
Expand All @@ -532,7 +540,7 @@ def main(gff_file=None, fasta_file=None, stype=None, user_defined=None, dline=No
tmp_stype = 'pep'
feature_type = ['CDS']
logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
tmpseq = splicer(gff, feature_type, dline, tmp_stype)
tmpseq = splicer(gff, feature_type, dline, tmp_stype, embedded_fasta)
for k,v in tmpseq.items():
k = k.replace("|mRNA(CDS)|", "|peptide|")
v = translator(v)
Expand All @@ -546,7 +554,7 @@ def main(gff_file=None, fasta_file=None, stype=None, user_defined=None, dline=No
report_fh.write('{0:s}\n{1:s}\n'.format(k,v))
elif stype == 'user_defined':
feature_type = [user_defined[0],user_defined[1]]
seq = splicer(gff, feature_type, dline, stype)
seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
if len(seq):
logger.info('Print out extracted sequences: {0:s}_{1:s}.fa...'.format(output_prefix, stype))
for k,v in seq.items():
Expand All @@ -555,16 +563,16 @@ def main(gff_file=None, fasta_file=None, stype=None, user_defined=None, dline=No

else:
if stype == 'pre_trans' or stype == 'gene' or stype == 'exon':
seq = extract_start_end(gff, stype, dline)
seq = extract_start_end(gff, stype, dline, embedded_fasta)
elif stype == 'trans':
feature_type = ['exon', 'pseudogenic_exon']
seq = splicer(gff, feature_type, dline, stype)
seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
elif stype == 'cds':
feature_type = ['CDS']
seq = splicer(gff, feature_type, dline, stype)
seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
elif stype == 'pep':
feature_type = ['CDS']
tmpseq = splicer(gff, feature_type, dline, stype)
tmpseq = splicer(gff, feature_type, dline, stype, embedded_fasta)
for k,v in tmpseq.items():
k = k.replace("|mRNA(CDS)|", "|peptide|")
#k = re.sub(r'(.*-)(R)(.)',r'\1P\3',k)
Expand Down Expand Up @@ -607,6 +615,7 @@ def script_main():
"""))
parser.add_argument('-g', '--gff', type=str, help='Genome annotation file in GFF3 format')
parser.add_argument('-f', '--fasta', type=str, help='Genome sequences in FASTA format')
parser.add_argument('-embf', '--embedded_fasta', action='store_true', help='Specify this option if you want to extract sequence from embedded fasta.', default=False)
parser.add_argument('-st', '--sequence_type', type=str, help="{0:s}\n\t{1:s}\n\t{2:s}\n\t{3:s}\n\t{4:s}\n\t{5:s}\n\t{6:s}\n\t{7:s}\n\t{8:s}".format('Type of sequences you would like to extract: ','"all" - FASTA files for all types of sequences listed below, except user_defined;','"gene" - gene sequence for each record;', '"exon" - exon sequence for each record;', '"pre_trans" - genomic region of a transcript model (premature transcript);', '"trans" - spliced transcripts (only exons included);', '"cds" - coding sequences;', '"pep" - peptide sequences;', '"user_defined" - specify parent and child features via the -u argument.'))
parser.add_argument('-u', '--user_defined', nargs='*', help="Specify parent and child features for fasta extraction, format: [parent feature type] [child feature type] (ex: -u mRNA CDS). Required if -st user_defined is given.")
parser.add_argument('-d', '--defline', type=str, help="{0:s}\n\t{1:s}\n\t{2:s}".format('Defline format in the output FASTA file:','"simple" - only ID would be shown in the defline;', '"complete" - complete information of the feature would be shown in the defline.'))
Expand All @@ -631,9 +640,10 @@ def script_main():
args.fasta = sys.stdin
logger_stderr.info('Reading from STDIN...')
else: # no input
parser.print_help()
logger_stderr.error('Required field -f missing...')
sys.exit(1)
if not args.embedded_fasta:
parser.print_help()
logger_stderr.error('Required field -f missing...')
sys.exit(1)

if args.sequence_type:
logger_stderr.info('Specifying sequence type: (%s)...', args.sequence_type)
Expand Down Expand Up @@ -661,4 +671,4 @@ def script_main():
sys.exit(1)


main(args.gff, args.fasta, args.sequence_type,args.user_defined, args.defline, args.quality_control, args.output_prefix, logger_stderr)
main(args.gff, args.fasta, args.embedded_fasta, args.sequence_type, args.user_defined, args.defline, args.quality_control, args.output_prefix, logger_stderr)
24 changes: 12 additions & 12 deletions gff3tool/lib/gff3/gff3.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,12 +379,13 @@ def check_reference(self, sequence_region=False, fasta_embedded=False, fasta_ext
self.add_line_error(line_data, {'message': 'Seqid not found in any ##sequence-region: {0:s}'.format(
seqid), 'error_type': 'BOUNDS', 'location': 'sequence_region', 'eCode': 'Esf0004'})
continue
if line_data['start'] < valid_sequence_regions[seqid]['start']:
error_lines.add(line_data['line_index'])
self.add_line_error(line_data, {'message': 'Start is less than the ##sequence-region start: %d' % valid_sequence_regions[seqid]['start'], 'error_type': 'BOUNDS', 'location': 'sequence_region', 'eCode': 'Esf0005'})
if line_data['end'] > valid_sequence_regions[seqid]['end']:
error_lines.add(line_data['line_index'])
self.add_line_error(line_data, {'message': 'End is greater than the ##sequence-region end: %d' % valid_sequence_regions[seqid]['end'], 'error_type': 'BOUNDS', 'location': 'sequence_region', 'eCode': 'Esf0006'})
if seqid not in unresolved_seqid:
if line_data['start'] < valid_sequence_regions[seqid]['start']:
error_lines.add(line_data['line_index'])
self.add_line_error(line_data, {'message': 'Start is less than the ##sequence-region start: %d' % valid_sequence_regions[seqid]['start'], 'error_type': 'BOUNDS', 'location': 'sequence_region', 'eCode': 'Esf0005'})
if line_data['end'] > valid_sequence_regions[seqid]['end']:
error_lines.add(line_data['line_index'])
self.add_line_error(line_data, {'message': 'End is greater than the ##sequence-region end: %d' % valid_sequence_regions[seqid]['end'], 'error_type': 'BOUNDS', 'location': 'sequence_region', 'eCode': 'Esf0006'})
elif sequence_region:
self.logger.debug('##sequence-region not found in GFF3')
# check fasta_embedded
Expand All @@ -398,9 +399,10 @@ def check_reference(self, sequence_region=False, fasta_embedded=False, fasta_ext
self.add_line_error(line_data, {'message': 'Seqid not found in the embedded ##FASTA: %s' % seqid, 'error_type': 'BOUNDS', 'location': 'fasta_embedded', 'eCode': 'Esf0007'})
continue
# check bounds
if line_data['end'] > len(self.fasta_embedded[seqid]['seq']):
error_lines.add(line_data['line_index'])
self.add_line_error(line_data, {'message': 'End is greater than the embedded ##FASTA sequence length: %d' % len(self.fasta_embedded[seqid]['seq']), 'error_type': 'BOUNDS', 'location': 'fasta_embedded', 'eCode': 'Esf0008'})
if seqid not in unresolved_seqid:
if line_data['end'] > len(self.fasta_embedded[seqid]['seq']):
error_lines.add(line_data['line_index'])
self.add_line_error(line_data, {'message': 'End is greater than the embedded ##FASTA sequence length: %d' % len(self.fasta_embedded[seqid]['seq']), 'error_type': 'BOUNDS', 'location': 'fasta_embedded', 'eCode': 'Esf0008'})
# check n
if check_n and line_data['type'] in check_n_feature_types:
"""
Expand Down Expand Up @@ -432,12 +434,10 @@ def check_reference(self, sequence_region=False, fasta_embedded=False, fasta_ext
self.add_line_error(line_data, {'message': 'Seqid not found in the external FASTA file: %s' % seqid, 'error_type': 'BOUNDS', 'location': 'fasta_external', 'eCode': 'Esf0010'})
continue
# check bounds
try:
if seqid not in unresolved_seqid:
if line_data['end'] > len(self.fasta_external[seqid]['seq']):
error_lines.add(line_data['line_index'])
self.add_line_error(line_data, {'message': 'End is greater than the external FASTA sequence length: %d' % len(self.fasta_external[seqid]['seq']), 'error_type': 'BOUNDS', 'location': 'fasta_external', 'eCode': 'Esf0011'})
except:
logger.warning('[Missing SeqID] Missing SeqID. \n\t\t- Line {0:s}: {1:s}'.format(str(line_data['line_index']+1), line_data['line_raw']))
# check n
if check_n and line_data['type'] in check_n_feature_types:
try:
Expand Down

0 comments on commit 78cb8ed

Please sign in to comment.