From 4e478649368762b57e99cd626f35229ed2dbe32b Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Mon, 20 Nov 2023 20:51:19 +0100 Subject: [PATCH 1/4] Enlarge the sampling range for column determination in FilterGTF script. --- CHANGELOG.md | 5 +++-- bin/filter_gtf.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38d6bfb30..72092ef14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements and fixes -- [PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Fixes error when transcript_fasta not provided and skip_gtf_filter set to true -- [#1125](https://github.com/nf-core/rnaseq/issues/1125) - Pipeline fails if transcript_fasta not provided and skip_gtf_filter = true +- [[#1125](https://github.com/nf-core/rnaseq/issues/1125)][[#1126](https://github.com/nf-core/rnaseq/pull/1126)] - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. +- [[#1127](https://github.com/nf-core/rnaseq/pull/)] - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. 
+ ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 265250627..b7b4e972c 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -23,14 +23,14 @@ def extract_fasta_seq_names(fasta_name: str) -> Set[str]: def tab_delimited(file: str) -> float: """Check if file is tab-delimited and return median number of tabs.""" with open(file, "r") as f: - data = f.read(1024) + data = f.read(102400) return statistics.median(line.count("\t") for line in data.split("\n")) def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None: """Filter GTF file based on FASTA sequence names.""" if tab_delimited(gtf_in) != 8: - raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.") + raise ValueError("Invalid GTF file: Expected nine tab-separated columns.") seq_names_in_genome = extract_fasta_seq_names(fasta) logger.info(f"Extracted chromosome sequence names from {fasta}") From 0b2da651d96691f274bea6e55a2360cb7d8e0b46 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Mon, 20 Nov 2023 21:00:15 +0100 Subject: [PATCH 2/4] Prettier on Markdown documents. --- CHANGELOG.md | 1 - docs/usage.md | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 72092ef14..11e4ee828 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [[#1125](https://github.com/nf-core/rnaseq/issues/1125)][[#1126](https://github.com/nf-core/rnaseq/pull/1126)] - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. - [[#1127](https://github.com/nf-core/rnaseq/pull/)] - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. 
- ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 ### Enhancements and fixes diff --git a/docs/usage.md b/docs/usage.md index 524acba9b..894f66f32 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -93,9 +93,9 @@ The `--umitools_grouping_method` parameter affects [how similar, but non-identic #### Examples: -| UMI type | Source | Pipeline parameters | -| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | +| UMI type | Source | Pipeline parameters | +| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | | In sequence | [Lexogen QuantSeq® 3’ mRNA-Seq V2 FWD](https://www.lexogen.com/quantseq-3mrna-sequencing) + [UMI Second Strand Synthesis 
Module](https://faqs.lexogen.com/faq/how-can-i-add-umis-to-my-quantseq-libraries) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{6})(?P<discard_1>.{4}).*"` | | In sequence | [Lexogen CORALL® Total RNA-Seq V1](https://www.lexogen.com/corall-total-rna-seq/)
> _mind [Appendix H](https://www.lexogen.com/wp-content/uploads/2020/04/095UG190V0130_CORALL-Total-RNA-Seq_2020-03-31.pdf) regarding optional trimming_ | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*"`
Optional: `--clip_r2 9 --three_prime_clip_r2 12` | | In sequence | [Takara Bio SMARTer® Stranded Total RNA-Seq Kit v3](https://www.takarabio.com/documents/User%20Manual/SMARTer%20Stranded%20Total%20RNA/SMARTer%20Stranded%20Total%20RNA-Seq%20Kit%20v3%20-%20Pico%20Input%20Mammalian%20User%20Manual-a_114949.pdf) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern2 "^(?P.{8})(?P.{6}).*"` | From 6b85d1e9913af4d687beb981505abf07276ccb7e Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Mon, 20 Nov 2023 21:07:08 +0100 Subject: [PATCH 3/4] New version of Prettier, new changes to Markdown. Love it....not. --- docs/usage.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 894f66f32..524acba9b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -93,9 +93,9 @@ The `--umitools_grouping_method` parameter affects [how similar, but non-identic #### Examples: -| UMI type | Source | Pipeline parameters | -| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | +| UMI type | Source | Pipeline parameters | +| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 
---------------------------------------------------------------------------------------------------------------------------------------------- | +| In read name | [Illumina BCL convert >3.7.5](https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl_convert/bcl-convert-v3-7-5-software-guide-1000000163594-00.pdf) | `--with_umi --skip_umi_extract --umitools_umi_separator ":"` | | In sequence | [Lexogen QuantSeq® 3’ mRNA-Seq V2 FWD](https://www.lexogen.com/quantseq-3mrna-sequencing) + [UMI Second Strand Synthesis Module](https://faqs.lexogen.com/faq/how-can-i-add-umis-to-my-quantseq-libraries) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P.{6})(?P.{4}).*"` | | In sequence | [Lexogen CORALL® Total RNA-Seq V1](https://www.lexogen.com/corall-total-rna-seq/)
> _mind [Appendix H](https://www.lexogen.com/wp-content/uploads/2020/04/095UG190V0130_CORALL-Total-RNA-Seq_2020-03-31.pdf) regarding optional trimming_ | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*"`
Optional: `--clip_r2 9 --three_prime_clip_r2 12` | | In sequence | [Takara Bio SMARTer® Stranded Total RNA-Seq Kit v3](https://www.takarabio.com/documents/User%20Manual/SMARTer%20Stranded%20Total%20RNA/SMARTer%20Stranded%20Total%20RNA-Seq%20Kit%20v3%20-%20Pico%20Input%20Mammalian%20User%20Manual-a_114949.pdf) | `--with_umi --umitools_extract_method "regex" --umitools_bc_pattern2 "^(?P.{8})(?P.{6}).*"` | From 9250c4c26b330b3ce038e231c94bbfe2b4b2267a Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 20 Nov 2023 21:04:00 +0000 Subject: [PATCH 4/4] Update CHANGELOG.md --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11e4ee828..900c3718c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements and fixes -- [[#1125](https://github.com/nf-core/rnaseq/issues/1125)][[#1126](https://github.com/nf-core/rnaseq/pull/1126)] - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. -- [[#1127](https://github.com/nf-core/rnaseq/pull/)] - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. +- [[PR #1126](https://github.com/nf-core/rnaseq/pull/1126)] [[#1125](https://github.com/nf-core/rnaseq/issues/1125)] - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. +- [[PR #1127](https://github.com/nf-core/rnaseq/pull/1127)] - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17