diff --git a/docs/gff3_fix.md b/docs/gff3_fix.md index 8d63a9a..405fd96 100644 --- a/docs/gff3_fix.md +++ b/docs/gff3_fix.md @@ -17,15 +17,19 @@ Python 2.7 1. Corrected GFF3 ## Quick start -`python2.7 bin/gff3_fix.py -qc_r error.txt -g example.gff3 -og corrected.gff3` +`python2.7 bin/gff3_fix.py -qc_r error.txt -g example_file/example.gff3 -og corrected.gff3` ## Optional arguments 1. -h, --help - show this help message and exit -2. -og OUTPUT_GFF, --output_gff OUTPUT_GFF - - output gff3 file name -3. -v, --version +2. -qc_r QC_REPORT, --qc_report QC_REPORT + - Error report from gff3_QC.py +3. -g GFF, --gff GFF + - Genome annotation file, gff3 format +4. -og OUTPUT_GFF, --output_gff OUTPUT_GFF + - output gff3 file name (default: corrected.gff3) +5. -v, --version - show program's version number and exit ## More information diff --git a/docs/gff3_merge.md b/docs/gff3_merge.md index 67537ba..2e2932f 100644 --- a/docs/gff3_merge.md +++ b/docs/gff3_merge.md @@ -2,7 +2,7 @@ ## Usage -gff3_merge.py [-h] [-g1 GFF_FILE1] [-g2 GFF_FILE2] [-f FASTA] [-og OUTPUT_GFF] [-r REPORT_FILE] [-noAuto] [-v] +gff3_merge.py [-h] [-g1 GFF_FILE1] [-g2 GFF_FILE2] [-f FASTA] [-u1 USER_DEFINED_FILE1] [-u2 USER_DEFINED_FILE2] [-og OUTPUT_GFF] [-r REPORT_FILE] [-a] [-noAuto] [-v] ## Testing environment @@ -20,22 +20,34 @@ gff3_merge.py [-h] [-g1 GFF_FILE1] [-g2 GFF_FILE2] [-f FASTA] [-og OUTPUT_GFF] [ ## Quick start * Merge the two files with auto-assignment of replace tags (default) - `python2.7 GFF3toolkit/bin/gff3_merge.py -g1 GFF3toolkit/example_file/new_models.gff3 -g2 GFF3toolkit/example_file/reference.gff3 -f GFF3toolkit/example_file/reference.fa -og merged.gff -r merged_report.txt` + `python2.7 bin/gff3_merge.py -g1 example_file/new_models.gff3 -g2 example_file/reference.gff3 -f example_file/reference.fa -og merged.gff -r merged_report.txt` * If your GFF3 files have proper replace tags at column 9 (Format: replace=[Transcript ID]), you can merge the two GFF3 files without auto-assignment 
of replace tags. - `python2.7 GFF3toolkit/bin/gff3_merge.py -g1 GFF3toolkit/example_file/new_models_w_replace.gff3 -g2 GFF3toolkit/example_file/reference.gff3 -f GFF3toolkit/example_file/reference.fa -og merged.gff -r merged_report.txt -noAuto` + `python2.7 bin/gff3_merge.py -g1 example_file/new_models_w_replace.gff3 -g2 example_file/reference.gff3 -f example_file/reference.fa -og merged.gff -r merged_report.txt -noAuto` ## Optional arguments 1. -h, --help - show this help message and exit -2. -og OUTPUT_GFF, --output_gff OUTPUT_GFF +2. -g1 GFF_FILE1, --gff_file1 GFF_FILE1 + - Updated GFF3 file, such as Apollo gff +3. -g2 GFF_FILE2, --gff_file2 GFF_FILE2 + - Reference GFF3 file, such as Maker gff or OGS gff +4. -f FASTA, --fasta FASTA + - Genomic sequences in the fasta format +5. -u1 USER_DEFINED_FILE1, --user_defined_file1 USER_DEFINED_FILE1 + - File for specifying parent and child features for fasta extraction from updated GFF3 file. +6. -u2 USER_DEFINED_FILE2, --user_defined_file2 USER_DEFINED_FILE2 + - File for specifying parent and child features for fasta extraction from reference GFF3 file. +7. -og OUTPUT_GFF, --output_gff OUTPUT_GFF - The merged GFF3 file (default: merged.gff) -3. -r REPORT_FILE, --report_file REPORT_FILE +8. -r REPORT_FILE, --report_file REPORT_FILE - Log file for the integration (default: merge_report.txt) -4. -noAuto, --auto_assignment +9. -a, --all + - auto-assignment replace tags for all transcript features. (default: Only automatically assign replace tags for the transcript without replace tags) +10. -noAuto, --auto_assignment - Turn off the auto-assignment of replace tags, if you have had the replace tags in your update gff (default: Automatically assign replace tags and then merge the gff files) -5. -v, --version +11. 
-v, --version - show program's version number and exit ## More information diff --git a/gff3tool/bin/gff3_fix.py b/gff3tool/bin/gff3_fix.py index a8a01d9..bb8b35c 100755 --- a/gff3tool/bin/gff3_fix.py +++ b/gff3tool/bin/gff3_fix.py @@ -34,15 +34,28 @@ def script_main(): Quick start: - python2.7 bin/gff3_fix.py -qc_r error.txt -g example.gff3 -og corrected.gff3 + python2.7 bin/gff3_fix.py -qc_r error.txt -g example_file/example.gff3 -og corrected.gff3 """)) parser.add_argument('-qc_r', '--qc_report', type=str, help='Error report from gff3_QC.py') parser.add_argument('-g', '--gff', type=str, help='Genome annotation file, gff3 format') #parser.add_argument('-r', '--report', type=str, help='output report file name') - parser.add_argument('-og', '--output_gff', type=str, help='output gff3 file name') + parser.add_argument('-og', '--output_gff', type=str, help='output gff3 file name', default='corrected.gff3') parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__) + args = parser.parse_args() + if args.qc_report: + logger_stderr.info('Checking QC report file (%s)...', args.qc_report) + else: # no input + parser.print_help() + sys.exit() + + if args.gff: + logger_stderr.info('Checking GFF3 file (%s)...', args.gff) + else: # no input + parser.print_help() + sys.exit() + logger_stderr.info('Reading QC report file: (%s)...\n', args.qc_report) #error_dict example: {'Emr0001': [[15,16],[13]],'Esf0005': [[17]]} error_dict = {} diff --git a/gff3tool/bin/gff3_merge.py b/gff3tool/bin/gff3_merge.py index 42e5316..80f05a9 100755 --- a/gff3tool/bin/gff3_merge.py +++ b/gff3tool/bin/gff3_merge.py @@ -45,7 +45,7 @@ def check_replace(gff, user_defined1=None): return False -def main(gff_file1, gff_file2, fasta, report, output_gff, auto=True, user_defined1=None, user_defined2=None, logger=None): +def main(gff_file1, gff_file2, fasta, report, output_gff, all_assign=False, auto=True, user_defined1=None, user_defined2=None, logger=None): logger_null = 
logging.getLogger(__name__+'null') null_handler = logging.NullHandler() logger_null.addHandler(null_handler) @@ -66,8 +66,8 @@ def main(gff_file1, gff_file2, fasta, report, output_gff, auto=True, user_define autoReviseReport = '{0:s}/replace_tag_report.txt'.format(autoDIR) logger.info('========== Auto-assignment of replace tags for each transcript model ==========') - gff3_merge.auto_replace_tag.main(gff1=gff_file1, gff2=gff_file2, fasta=fasta, outdir=autoDIR, scode='TEMP', user_defined1=user_defined1, user_defined2=user_defined2, logger=logger) - gff3_merge.revision.main(gff_file1, autoFILE, autoReviseGff, autoReviseReport, user_defined1, auto, logger) + gff3_merge.auto_replace_tag.main(gff1=gff_file1, gff2=gff_file2, fasta=fasta, outdir=autoDIR, scode='TEMP', all_assign=all_assign, user_defined1=user_defined1, user_defined2=user_defined2, logger=logger) + gff3_merge.revision.main(gff_file=gff_file1, revision_file=autoFILE, output_gff=autoReviseGff, report_file=autoReviseReport, user_defined1=user_defined1, auto=auto, logger=logger) logger.info('========== Check whether there are missing replace tags ==========') gff3 = Gff3(gff_file=autoReviseGff, logger=logger_null) @@ -141,6 +141,7 @@ def script_main(): parser.add_argument('-u2', '--user_defined_file2', type=str, help='File for specifing parent and child features for fasta extraction from reference GFF3 file.') parser.add_argument('-og', '--output_gff', type=str, help='The merged GFF3 file') parser.add_argument('-r', '--report_file', type=str, help='Log file for the integration') + parser.add_argument('-a', '--all', action='store_true', help='auto-assignment replace tags for all transcript features. 
(default: Only automatically assign replace tags for the transcript without replace tags)') parser.add_argument('-noAuto', '--auto_assignment', action='store_false', help='Turn off the auto-assignment of replace tags, if you already have replace tags in your updated gff (default: Automatically assign replace tags and then merge the gff files)') parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__) @@ -214,7 +215,9 @@ def script_main(): parser.print_help() sys.exit(1) - + if args.all and not args.auto_assignment: + logger_stderr.error('-a and -noAuto specify opposite behaviors, only one of the two arguments can be accepted.') + sys.exit(0) if args.report_file: logger_stderr.info('Writing validation report (%s)...\n', args.report_file) report_fh = open(args.report_file, 'wb') @@ -224,4 +227,4 @@ def script_main(): if not args.output_gff: args.output_gff='merged.gff' - main(args.gff_file1, args.gff_file2, args.fasta, report_fh, args.output_gff, args.auto_assignment, args.user_defined_file1, args.user_defined_file2, logger=logger_stderr) + main(args.gff_file1, args.gff_file2, args.fasta, report_fh, args.output_gff, args.all, args.auto_assignment, args.user_defined_file1, args.user_defined_file2, logger=logger_stderr) diff --git a/gff3tool/lib/gff3_merge/auto_replace_tag.py b/gff3tool/lib/gff3_merge/auto_replace_tag.py index 23ba291..44f2be8 100755 --- a/gff3tool/lib/gff3_merge/auto_replace_tag.py +++ b/gff3tool/lib/gff3_merge/auto_replace_tag.py @@ -29,7 +29,7 @@ __version__ = '0.0.3' -def main(gff1, gff2, fasta, outdir, scode, logger, user_defined1=None, user_defined2=None): +def main(gff1, gff2, fasta, outdir, scode, logger, all_assign=False, user_defined1=None, user_defined2=None): logger_null = logging.getLogger(__name__+'null') null_handler = logging.NullHandler() logger_null.addHandler(null_handler) @@ -51,8 +51,12 @@ def main(gff1, gff2, fasta, outdir, scode, logger, user_defined1=None, user_defi roots =[] for line in 
gff3_1.lines: try: - if line['line_type'] == 'feature' and not line['attributes'].has_key('Parent') and len(line['attributes']) != 0: - roots.append(line) + if line['line_type'] == 'feature': + # remove all the replace attributes + if all_assign and 'replace' in line['attributes']: + del line['attributes']['replace'] + if 'Parent' not in line['attributes'] and len(line['attributes']) != 0: + roots.append(line) except: pass for root in roots: @@ -75,11 +79,19 @@ def main(gff1, gff2, fasta, outdir, scode, logger, user_defined1=None, user_defi for lines in user_defined1: transcripts_type.add(lines[0]) for line in gff3_1.lines: + if line['line_type'] == 'feature': + if all_assign and 'replace' in line['attributes']: + del line['attributes']['replace'] if line['type'] in transcripts_type: id = str() if line['attributes'].has_key('ID'): id = line['attributes']['ID'] transcripts.add(id) + if all_assign: + # modified gff1 without any relace attributes + gff3_1_mod = '{0:s}/{1:s}'.format(tmpdir, 'gff1_mod.gff3') + gff3_1.write(gff3_1_mod) + gff1 = gff3_1_mod out1_type = '{0:s}/{1:s}'.format(tmpdir, 'gff1_transcript_type.txt') with open(out1_type, "w") as trans_type: diff --git a/gff3tool/lib/gff3_merge/revision.py b/gff3tool/lib/gff3_merge/revision.py index 440d8de..8137d32 100755 --- a/gff3tool/lib/gff3_merge/revision.py +++ b/gff3tool/lib/gff3_merge/revision.py @@ -33,7 +33,7 @@ __version__ = '1.0.3' -def main(gff_file, revision_file, output_gff, report_file=None,user_defined1=None, auto=True,logger=None): +def main(gff_file, revision_file, output_gff, report_file=None, user_defined1=None, auto=True, logger=None): logger_null = logging.getLogger(__name__+'null') null_handler = logging.NullHandler() logger_null.addHandler(null_handler) @@ -331,4 +331,4 @@ def main(gff_file, revision_file, output_gff, report_file=None,user_defined1=Non if not args.output_gff: args.output_gff = 'Revised_{0:s}'.format(args.gff_file) - main(args.gff_file, args.revision_file, args.output_gff, 
args.report_file, logger=logger_stderr) + main(gff_file=args.gff_file, revision_file=args.revision_file, output_gff=args.output_gff, report_file=args.report_file, logger=logger_stderr)