diff --git a/workflows/genome_annotation/lncRNAs_annotation/.dockstore.yml b/workflows/genome_annotation/lncRNAs_annotation/.dockstore.yml new file mode 100644 index 000000000..0f1660c7c --- /dev/null +++ b/workflows/genome_annotation/lncRNAs_annotation/.dockstore.yml @@ -0,0 +1,11 @@ +version: 1.2 +workflows: +- name: main + subclass: Galaxy + publish: true + primaryDescriptorPath: /Galaxy-Workflow-lncRNAs_annotation_workflow.ga + testParameterFiles: + - /Galaxy-Workflow-lncRNAs_annotation_workflow-tests.yml + authors: + - name: Romane Libouban + email: romane.libouban@irisa.fr diff --git a/workflows/genome_annotation/lncRNAs_annotation/CHANGELOG.md b/workflows/genome_annotation/lncRNAs_annotation/CHANGELOG.md new file mode 100644 index 000000000..c22e18ddc --- /dev/null +++ b/workflows/genome_annotation/lncRNAs_annotation/CHANGELOG.md @@ -0,0 +1,5 @@ +# Changelog + +## [0.1] + +Initial version of the lncRNAs annotation workflow. \ No newline at end of file diff --git a/workflows/genome_annotation/lncRNAs_annotation/Galaxy-Workflow-lncRNAs_annotation_workflow-tests.yml b/workflows/genome_annotation/lncRNAs_annotation/Galaxy-Workflow-lncRNAs_annotation_workflow-tests.yml new file mode 100644 index 000000000..8df077656 --- /dev/null +++ b/workflows/genome_annotation/lncRNAs_annotation/Galaxy-Workflow-lncRNAs_annotation_workflow-tests.yml @@ -0,0 +1,40 @@ +- doc: Test outline for Galaxy-Workflow-lncRNAs_annotation_workflow.ga + job: + Genome assembly: + class: File + location: https://zenodo.org/records/11367439/files/genome_assembly.fasta + filetype: fasta + Genome annotation: + class: File + location: https://zenodo.org/records/11367439/files/genome_annotation.gff3 + filetype: gff3 + RNA-Seq: + class: File + location: https://zenodo.org/records/11367439/files/SRR8534859_RNASeq_mapped.bam + filetype: bam + + + outputs: + genome_annotation_gtf: + location: https://zenodo.org/records/13941438/files/gffread.gtf?download=1 + compare: sim_size + delta: 300000 + + 
stringtie_gtf: + location: https://zenodo.org/records/13941438/files/StringTie.gtf?download=1 + compare: sim_size + delta: 300000 + + lcnRNA_annotation: + location: https://zenodo.org/records/13941438/files/lncRNA_annotation_FEELnc.gtf?download=1 + compare: sim_size + delta: 300000 + classification: + location: https://zenodo.org/records/13941438/files/Classifier_FEELnc.txt?download=1 + compare: sim_size + delta: 300000 + + lncRNA_genome_annotation: + location: https://zenodo.org/records/13941438/files/Concatenate_datasets.gtf?download=1 + compare: sim_size + delta: 300000 diff --git a/workflows/genome_annotation/lncRNAs_annotation/Galaxy-Workflow-lncRNAs_annotation_workflow.ga b/workflows/genome_annotation/lncRNAs_annotation/Galaxy-Workflow-lncRNAs_annotation_workflow.ga new file mode 100644 index 000000000..cb96004e8 --- /dev/null +++ b/workflows/genome_annotation/lncRNAs_annotation/Galaxy-Workflow-lncRNAs_annotation_workflow.ga @@ -0,0 +1,416 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "This workflow uses the FEELnc tool to annotate long non-coding RNAs. Before annotating these long non-coding RNAs, StringTie will be used to assemble the RNA-seq alignments into potential transcripts. 
The gffread tool provides a genome annotation file in GTF format.", + "comments": [ + { + "child_steps": [ + 3 + ], + "color": "pink", + "data": { + "title": "Conversion from GFF3 to GTF format" + }, + "id": 2, + "position": [ + 443, + 398.4 + ], + "size": [ + 239, + 285 + ], + "type": "frame" + }, + { + "child_steps": [ + 1, + 2, + 0 + ], + "color": "yellow", + "data": { + "title": "Inputs" + }, + "id": 0, + "position": [ + 0, + 206.4 + ], + "size": [ + 243, + 326.2 + ], + "type": "frame" + }, + { + "child_steps": [ + 6 + ], + "color": "blue", + "data": { + "title": "Annotation with mRNA and lncRNA" + }, + "id": 4, + "position": [ + 1134.3, + 434.4 + ], + "size": [ + 268, + 262 + ], + "type": "frame" + }, + { + "child_steps": [ + 5 + ], + "color": "red", + "data": { + "title": "lncRNAs annotation with FEELnc" + }, + "id": 3, + "position": [ + 762.1, + 132.1 + ], + "size": [ + 240, + 322 + ], + "type": "frame" + }, + { + "child_steps": [ + 4 + ], + "color": "green", + "data": { + "title": "Transcripts assembly with StringTie" + }, + "id": 1, + "position": [ + 437.8, + 0.0 + ], + "size": [ + 236, + 256 + ], + "type": "frame" + } + ], + "creator": [ + { + "class": "Person", + "email": "mailto:romane.libouban@irisa.fr", + "name": "Romane Libouban" + } + ], + "format-version": "0.1", + "license": "MIT", + "release": "0.1", + "name": "lncRNAs annotation workflow", + "report": { + "markdown": "\n# Workflow Execution Report\n\n## Workflow Inputs\n```galaxy\ninvocation_inputs()\n```\n\n## Workflow Outputs\n```galaxy\ninvocation_outputs()\n```\n\n## Workflow\n```galaxy\nworkflow_display()\n```\n" + }, + "steps": { + "0": { + "annotation": "Genome assembly", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "Genome assembly" + } + ], + "label": "Genome assembly", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 140, + "top": 130 + }, + "tool_id": null, + "tool_state": 
"{\"optional\": false, \"format\": [\"fasta\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "6c129c58-d982-445f-b41c-c2c7387c5e81", + "when": null, + "workflow_outputs": [] + }, + "1": { + "annotation": "Genome annotation", + "content_id": null, + "errors": null, + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "Genome annotation" + } + ], + "label": "Genome annotation", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 0, + "top": 290 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"gff3\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "c221afda-d336-4416-bfa0-f41a19c50097", + "when": null, + "workflow_outputs": [] + }, + "2": { + "annotation": "RNA-Seq", + "content_id": null, + "errors": null, + "id": 2, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "RNA-Seq" + } + ], + "label": "RNA-Seq", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 40, + "top": 460 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"bam\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "1eccf2fb-d243-4be7-a952-a9d6d1375b89", + "when": null, + "workflow_outputs": [] + }, + "3": { + "annotation": "protein sequences extracted", + "content_id": "toolshed.g2.bx.psu.edu/repos/devteam/gffread/gffread/2.2.1.4+galaxy0", + "errors": null, + "id": 3, + "input_connections": { + "input": { + "id": 1, + "output_name": "output" + } + }, + "inputs": [], + "label": "gffread", + "name": "gffread", + "outputs": [ + { + "name": "output_gtf", + "type": "gtf" + } + ], + "position": { + "left": 550, + "top": 380 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/gffread/gffread/2.2.1.4+galaxy0", + "tool_shed_repository": { + "changeset_revision": "3e436657dcd0", + "name": "gffread", + "owner": "devteam", + "tool_shed": 
"toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"input\", \"chr_replace\": null, \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"decode_url\": false, \"expose\": false, \"filtering\": null, \"full_gff_attribute_preservation\": false, \"gffs\": {\"gff_fmt\": \"gtf\", \"__current_case__\": 2, \"tname\": \"\"}, \"input\": {\"__class__\": \"ConnectedValue\"}, \"maxintron\": null, \"merging\": {\"merge_sel\": \"none\", \"__current_case__\": 0}, \"reference_genome\": {\"source\": \"none\", \"__current_case__\": 0}, \"region\": {\"region_filter\": \"none\", \"__current_case__\": 0}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.2.1.4+galaxy0", + "type": "tool", + "uuid": "b40d9096-4d41-47f7-a7ff-d23efa36c788", + "when": null, + "workflow_outputs": [ + { + "label": "genome_annotation_gtf", + "output_name": "output_gtf", + "uuid": "4357f14d-bf94-480b-ad72-a79ca909041d" + } + ] + }, + "4": { + "annotation": "Assembly step", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/stringtie/stringtie/2.2.3+galaxy0", + "errors": null, + "id": 4, + "input_connections": { + "guide|guide_source|ref_hist": { + "id": 1, + "output_name": "output" + }, + "input_options|input_bam": { + "id": 2, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool StringTie", + "name": "input_options" + } + ], + "label": "StringTie", + "name": "StringTie", + "outputs": [ + { + "name": "output_gtf", + "type": "gtf" + } + ], + "position": { + "left": 590, + "top": 0 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/stringtie/stringtie/2.2.3+galaxy0", + "tool_shed_repository": { + "changeset_revision": "cbf488da3b2c", + "name": "stringtie", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"input\", \"adv\": {\"abundance_estimation\": false, \"omit_sequences\": \"\", \"name_prefix\": null, \"fraction\": \"0.01\", 
\"min_tlen\": \"200\", \"min_anchor_len\": \"10\", \"min_anchor_cov\": \"1\", \"min_bundle_cov\": \"1\", \"bdist\": \"50\", \"bundle_fraction\": \"1.0\", \"disable_trimming\": false, \"multi_mapping\": false, \"point_features\": null}, \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"guide\": {\"use_guide\": \"yes\", \"__current_case__\": 1, \"guide_source\": {\"guide_gff_select\": \"history\", \"__current_case__\": 1, \"ref_hist\": {\"__class__\": \"ConnectedValue\"}}, \"input_estimation\": false, \"special_outputs\": {\"special_outputs_select\": \"no\", \"__current_case__\": 2}, \"coverage_file\": false}, \"input_options\": {\"input_mode\": \"short_reads\", \"__current_case__\": 0, \"input_bam\": {\"__class__\": \"ConnectedValue\"}}, \"rna_strandness\": \"\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.2.3+galaxy0", + "type": "tool", + "uuid": "632f7928-d838-4bbf-b0f9-6a94ea9eb427", + "when": null, + "workflow_outputs": [ + { + "label": "stringtie_gtf", + "output_name": "output_gtf", + "uuid": "6edddada-3f7f-4a62-bbb7-8f412234342f" + } + ] + }, + "5": { + "annotation": "annotation of lncRNAs", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/feelnc/feelnc/0.2.1+galaxy0", + "errors": null, + "id": 5, + "input_connections": { + "candidate": { + "id": 4, + "output_name": "output_gtf" + }, + "genome": { + "id": 0, + "output_name": "output" + }, + "reference": { + "id": 3, + "output_name": "output_gtf" + } + }, + "inputs": [], + "label": "FEELnc", + "name": "FEELnc", + "outputs": [ + { + "name": "candidate_lncRNA", + "type": "gtf" + }, + { + "name": "candidate_mRNA", + "type": "gtf" + }, + { + "name": "classifier", + "type": "txt" + } + ], + "position": { + "left": 920, + "top": 160 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/feelnc/feelnc/0.2.1+galaxy0", + "tool_shed_repository": { + "changeset_revision": "55daa4712413", + "name": "feelnc", + "owner": "iuc", + "tool_shed": 
"toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"__input_ext\": \"input\", \"candidate\": {\"__class__\": \"ConnectedValue\"}, \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"genome\": {\"__class__\": \"ConnectedValue\"}, \"reference\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "0.2.1+galaxy0", + "type": "tool", + "uuid": "e7d02c0b-09a9-46e0-836a-048a3a5fdc33", + "when": null, + "workflow_outputs": [ + { + "label": "classification", + "output_name": "classifier", + "uuid": "a1e803b3-e991-4c7f-969c-7e73b0d19b62" + }, + { + "label": "lcnRNA_annotation", + "output_name": "candidate_lncRNA", + "uuid": "d146e06c-9a4f-4860-8dcd-d7524dee7ef5" + }, + { + "label": "mRNAs_annotation", + "output_name": "candidate_mRNA", + "uuid": "0313d195-5bc6-4fd0-99cc-9e4a21960ae8" + } + ] + }, + "6": { + "annotation": "final annotation", + "content_id": "cat1", + "errors": null, + "id": 6, + "input_connections": { + "input1": { + "id": 5, + "output_name": "candidate_lncRNA" + }, + "queries_0|input2": { + "id": 3, + "output_name": "output_gtf" + } + }, + "inputs": [], + "label": "Concatenate Dataset", + "name": "Concatenate datasets", + "outputs": [ + { + "name": "out_file1", + "type": "input" + } + ], + "position": { + "left": 1420, + "top": 370 + }, + "post_job_actions": {}, + "tool_id": "cat1", + "tool_state": "{\"__input_ext\": \"gtf\", \"chromInfo\": \"/opt/galaxy/tool-data/shared/ucsc/chrom/?.len\", \"input1\": {\"__class__\": \"ConnectedValue\"}, \"queries\": [{\"__index__\": 0, \"input2\": {\"__class__\": \"ConnectedValue\"}}], \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.0.0", + "type": "tool", + "uuid": "27355e91-6006-443f-af12-33b801ec371b", + "when": null, + "workflow_outputs": [ + { + "label": "lncRNA_genome_annotation", + "output_name": "out_file1", + "uuid": "39a4ad30-330d-4cf3-934c-6facba9a92d9" + } + ] + } + }, + "tags": [], + "uuid": 
"7699f72e-1083-442e-b4cc-90ab57e4783c", + "version": 1 +} \ No newline at end of file diff --git a/workflows/genome_annotation/lncRNAs_annotation/README.md b/workflows/genome_annotation/lncRNAs_annotation/README.md new file mode 100644 index 000000000..8a09cdc15 --- /dev/null +++ b/workflows/genome_annotation/lncRNAs_annotation/README.md @@ -0,0 +1,46 @@ +# lncRNAs annotation workflow + +This workflow uses the FEELnc tool to annotate long non-coding RNAs. Before annotating these long non-coding RNAs, StringTie will be used to assemble the RNA-seq alignments into potential transcripts. The gffread tool provides a genome annotation file in GTF format. + +For future analyses, it would be interesting to use an updated annotation containing messenger RNA and long non-coding RNA. The concatenate tool merges the reference annotation with the long non-coding RNA annotation obtained with FEELnc. + +FEELnc is a 3-step pipeline: +- The first step, “filter”, consists of extracting and filtering out unwanted transcripts and transcripts overlapping the exons of the reference annotation. +- The second step, “codpot” (for coding potential), consists of calculating the coding potential of the transcripts. This step differentiates long non-coding RNAs from potential coding RNAs. +- The final step, “classifier”, classifies the new long non-coding RNAs according to the location and direction of transcription of the proximal transcribed RNAs. + +## Input dataset for StringTie +StringTie requires two inputs: +- the RNA-seq alignment in bam format +- the genome annotation file in gff3 format + +## Outputs dataset for StringTie +StringTie generates an annotation file in GTF format. This file contains all the assembled transcripts present in the RNA-seq data. + +## Input dataset for gffread +GFFRead requires an input file: the genome annotation in gff3 format. + + +## Outputs dataset for gffread +An output file is generated in gtf format. 
+ +## Input dataset for FEELnc +FEELnc requires 3 inputs: +- Transcript assembly in gtf format (corresponding to the StringTie output file) +- Reference annotation in gtf format (corresponding to the gffread output file) +- Genome sequence in fasta format + +## Outputs dataset for FEELnc +FEELnc generates 3 output files: +- Long non-coding RNA annotation file in gtf format +- Annotation file for messenger RNAs in gtf format +- Classifier output file: table containing the classification of lncRNAs according to their genomic location in relation to other transcripts + + +## Input dataset for concatenate +Concatenate requires 2 inputs: +- genome annotation in gtf format +- annotation of long non-coding RNAs in gtf format + +## Outputs dataset for concatenate +An output file in GTF format is generated, containing the genome annotation and the annotation of long non-coding RNAs. \ No newline at end of file