diff --git a/CHANGELOG.md b/CHANGELOG.md index 38d6bfb30..900c3718c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements and fixes -- [PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Fixes error when transcript_fasta not provided and skip_gtf_filter set to true -- [#1125](https://github.com/nf-core/rnaseq/issues/1125) - Pipeline fails if transcript_fasta not provided and skip_gtf_filter = true +- [[PR #1126](https://github.com/nf-core/rnaseq/pull/1126)] [[#1125](https://github.com/nf-core/rnaseq/issues/1125)] - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. +- [[PR #1127](https://github.com/nf-core/rnaseq/pull/1127)] - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py index 265250627..b7b4e972c 100755 --- a/bin/filter_gtf.py +++ b/bin/filter_gtf.py @@ -23,14 +23,14 @@ def extract_fasta_seq_names(fasta_name: str) -> Set[str]: def tab_delimited(file: str) -> float: """Check if file is tab-delimited and return median number of tabs.""" with open(file, "r") as f: - data = f.read(1024) + data = f.read(102400) return statistics.median(line.count("\t") for line in data.split("\n")) def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None: """Filter GTF file based on FASTA sequence names.""" if tab_delimited(gtf_in) != 8: - raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.") + raise ValueError("Invalid GTF file: Expected nine tab-separated columns.") seq_names_in_genome = extract_fasta_seq_names(fasta) logger.info(f"Extracted chromosome sequence names from {fasta}")