From a2a05bd9c28895de1880b8f4ebe2388e7f787a75 Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Tue, 30 Aug 2022 13:31:22 +0000 Subject: [PATCH 1/4] Template update for nf-core/tools version 2.5 --- .editorconfig | 2 +- .github/PULL_REQUEST_TEMPLATE.md | 3 +- .github/workflows/ci.yml | 23 ++------ .github/workflows/linting.yml | 38 +++++++++++-- CHANGELOG.md | 2 +- CITATION.cff | 56 +++++++++++++++++++ README.md | 21 +++---- assets/email_template.txt | 1 - bin/check_samplesheet.py | 41 +++++++------- conf/base.config | 5 ++ docs/usage.md | 12 ++-- lib/WorkflowMag.groovy | 5 +- lib/WorkflowMain.groovy | 9 ++- main.nf | 2 +- modules.json | 22 +++++--- .../templates/dumpsoftwareversions.py | 14 +++-- nextflow.config | 23 +++++++- 17 files changed, 186 insertions(+), 93 deletions(-) create mode 100644 CITATION.cff diff --git a/.editorconfig b/.editorconfig index b6b31907..b78de6e6 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,7 +8,7 @@ trim_trailing_whitespace = true indent_size = 4 indent_style = space -[*.{md,yml,yaml,html,css,scss,js}] +[*.{md,yml,yaml,html,css,scss,js,cff}] indent_size = 2 # These files are edited and tested upstream in nf-core/modules diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 89c37cb4..2b1d3b46 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -15,8 +15,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/mag/ - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/mag/tree/master/.github/CONTRIBUTING.md) - - [ ] If necessary, also make a PR on the nf-core/mag _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. +- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/mag/tree/master/.github/CONTRIBUTING.md)- [ ] If necessary, also make a PR on the nf-core/mag _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. 
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2d7a9053..74145c59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,7 +10,6 @@ on: env: NXF_ANSI_LOG: false - CAPSULE_LOG: none jobs: test: @@ -20,27 +19,17 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - # Nextflow versions - include: - # Test pipeline minimum Nextflow version - - NXF_VER: "21.10.3" - NXF_EDGE: "" - # Test latest edge release of Nextflow - - NXF_VER: "" - NXF_EDGE: "1" + NXF_VER: + - "21.10.3" + - "latest-everything" steps: - name: Check out pipeline code uses: actions/checkout@v2 - name: Install Nextflow - env: - NXF_VER: ${{ matrix.NXF_VER }} - # Uncomment only if the edge release is more recent than the latest stable release - # See https://github.com/nextflow-io/nextflow/issues/2467 - # NXF_EDGE: ${{ matrix.NXF_EDGE }} - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" - name: Run pipeline with test data # TODO nf-core: You can customise CI pipeline run tests as required diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 77358dee..8a5ce69b 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -35,6 +35,36 @@ jobs: - name: Run Prettier --check run: prettier --check ${GITHUB_WORKSPACE} + PythonBlack: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Check code lints with Black + uses: psf/black@stable + + # If the above check failed, post a comment on the PR explaining the failure + - name: Post PR comment + if: failure() + uses: mshick/add-pr-comment@v1 + with: + message: | + ## Python linting (`black`) is failing + + To keep the code consistent with lots of contributors, we run automated code consistency checks. + To fix this CI test, please run: + + * Install [`black`](https://black.readthedocs.io/en/stable/): `pip install black` + * Fix formatting errors in your pipeline: `black .` + + Once you push these changes the test should pass, and you can hide this comment :+1: + + We highly recommend setting up Black in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! + + Thanks again for your contribution! + repo-token: ${{ secrets.GITHUB_TOKEN }} + allow-repeats: false + nf-core: runs-on: ubuntu-latest steps: @@ -42,15 +72,11 @@ jobs: uses: actions/checkout@v2 - name: Install Nextflow - env: - CAPSULE_LOG: none - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ + uses: nf-core/setup-nextflow@v1 - uses: actions/setup-python@v3 with: - python-version: "3.6" + python-version: "3.7" architecture: "x64" - name: Install dependencies diff --git a/CHANGELOG.md b/CHANGELOG.md index 0803027f..5224713a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v2.2.0dev - [date] +## v2.3.0dev - [date] Initial release of nf-core/mag, created with the [nf-core](https://nf-co.re/) template. 
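[Editor's illustrative sketch, not part of the patch: the `PythonBlack` lint job added above can be reproduced locally before pushing. The exact flags used by the `psf/black@stable` action may differ slightly; the commands below assume `black` is installed via pip, as the PR comment in the workflow suggests.]

```bash
# Reproduce the CI formatting check locally (sketch; assumes a Python environment)
pip install black

# Report files that would be reformatted, as the CI job does
black --check .

# Apply the formatting in place before committing
black .
```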
diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..4533e2f2 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,56 @@ +cff-version: 1.2.0 +message: "If you use `nf-core tools` in your work, please cite the `nf-core` publication" +authors: + - family-names: Ewels + given-names: Philip + - family-names: Peltzer + given-names: Alexander + - family-names: Fillinger + given-names: Sven + - family-names: Patel + given-names: Harshil + - family-names: Alneberg + given-names: Johannes + - family-names: Wilm + given-names: Andreas + - family-names: Ulysse Garcia + given-names: Maxime + - family-names: Di Tommaso + given-names: Paolo + - family-names: Nahnsen + given-names: Sven +title: "The nf-core framework for community-curated bioinformatics pipelines." +version: 2.4.1 +doi: 10.1038/s41587-020-0439-x +date-released: 2022-05-16 +url: https://github.com/nf-core/tools +prefered-citation: + type: article + authors: + - family-names: Ewels + given-names: Philip + - family-names: Peltzer + given-names: Alexander + - family-names: Fillinger + given-names: Sven + - family-names: Patel + given-names: Harshil + - family-names: Alneberg + given-names: Johannes + - family-names: Wilm + given-names: Andreas + - family-names: Ulysse Garcia + given-names: Maxime + - family-names: Di Tommaso + given-names: Paolo + - family-names: Nahnsen + given-names: Sven + doi: 10.1038/s41587-020-0439-x + journal: nature biotechnology + start: 276 + end: 278 + title: "The nf-core framework for community-curated bioinformatics pipelines." + issue: 3 + volume: 38 + year: 2020 + url: https://dx.doi.org/10.1038/s41587-020-0439-x diff --git a/README.md b/README.md index bf57fbc3..bb443efd 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,14 @@ # ![nf-core/mag](docs/images/nf-core-mag_logo_light.png#gh-light-mode-only) ![nf-core/mag](docs/images/nf-core-mag_logo_dark.png#gh-dark-mode-only) -[![GitHub Actions CI Status](https://github.com/nf-core/mag/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/mag/actions?query=workflow%3A%22nf-core+CI%22) -[![GitHub Actions Linting Status](https://github.com/nf-core/mag/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/mag/actions?query=workflow%3A%22nf-core+linting%22) -[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?logo=Amazon%20AWS)](https://nf-co.re/mag/results) -[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8)](https://doi.org/10.5281/zenodo.XXXXXXX) +[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/mag/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg)](https://www.nextflow.io/) -[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?logo=anaconda)](https://docs.conda.io/en/latest/) -[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?logo=docker)](https://www.docker.com/) -[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg)](https://sylabs.io/docs/) +[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) +[![run with 
docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) +[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/mag) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23mag-4A154B?logo=slack)](https://nfcore.slack.com/channels/mag) -[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?logo=twitter)](https://twitter.com/nf_core) -[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23mag-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/mag)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction @@ -25,7 +20,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool -On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/mag/results). +On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources.The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/mag/results). ## Pipeline summary @@ -42,7 +37,7 @@ On release, automated continuous integration tests run the pipeline on a full-si 3. 
Download the pipeline and test it on a minimal dataset with a single command: - ```console + ```bash nextflow run nf-core/mag -profile test,YOURPROFILE --outdir ``` @@ -57,7 +52,7 @@ On release, automated continuous integration tests run the pipeline on a full-si - ```console + ```bash nextflow run nf-core/mag --input samplesheet.csv --outdir --genome GRCh37 -profile ``` diff --git a/assets/email_template.txt b/assets/email_template.txt index b5f77e8d..bcdde465 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -6,7 +6,6 @@ `._,._,' nf-core/mag v${version} ---------------------------------------------------- - Run Name: $runName <% if (success){ diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 3652c63c..9a8b8962 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -11,7 +11,6 @@ from collections import Counter from pathlib import Path - logger = logging.getLogger() @@ -79,13 +78,15 @@ def validate_and_transform(self, row): def _validate_sample(self, row): """Assert that the sample name exists and convert spaces to underscores.""" - assert len(row[self._sample_col]) > 0, "Sample input is required." + if len(row[self._sample_col]) <= 0: + raise AssertionError("Sample input is required.") # Sanitize samples slightly. row[self._sample_col] = row[self._sample_col].replace(" ", "_") def _validate_first(self, row): """Assert that the first FASTQ entry is non-empty and has the right format.""" - assert len(row[self._first_col]) > 0, "At least the first FASTQ file is required." + if len(row[self._first_col]) <= 0: + raise AssertionError("At least the first FASTQ file is required.") self._validate_fastq_format(row[self._first_col]) def _validate_second(self, row): @@ -97,36 +98,34 @@ def _validate_pair(self, row): """Assert that read pairs have the same file extension. Report pair status.""" if row[self._first_col] and row[self._second_col]: row[self._single_col] = False - assert ( - Path(row[self._first_col]).suffixes[-2:] == Path(row[self._second_col]).suffixes[-2:] - ), "FASTQ pairs must have the same file extensions." + if Path(row[self._first_col]).suffixes[-2:] != Path(row[self._second_col]).suffixes[-2:]: + raise AssertionError("FASTQ pairs must have the same file extensions.") else: row[self._single_col] = True def _validate_fastq_format(self, filename): """Assert that a given filename has one of the expected FASTQ extensions.""" - assert any(filename.endswith(extension) for extension in self.VALID_FORMATS), ( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) + if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): + raise AssertionError( + f"The FASTQ file has an unrecognized extension: {filename}\n" + f"It should be one of: {', '.join(self.VALID_FORMATS)}" + ) def validate_unique_samples(self): """ Assert that the combination of sample name and FASTQ filename is unique. - In addition to the validation, also rename the sample if more than one sample, - FASTQ file combination exists. + In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the + number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. """ - assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique." 
- if len({pair[0] for pair in self._seen}) < len(self._seen): - counts = Counter(pair[0] for pair in self._seen) - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - if counts[sample] > 1: - row[self._sample_col] = f"{sample}_T{seen[sample]}" + if len(self._seen) != len(self.modified): + raise AssertionError("The pair of sample name and FASTQ must be unique.") + seen = Counter() + for row in self.modified: + sample = row[self._sample_col] + seen[sample] += 1 + row[self._sample_col] = f"{sample}_T{seen[sample]}" def read_head(handle, num_lines=10): diff --git a/conf/base.config b/conf/base.config index a3173752..70688b6c 100644 --- a/conf/base.config +++ b/conf/base.config @@ -26,6 +26,11 @@ process { // adding in your local modules too. // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors + withLabel:process_single { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } withLabel:process_low { cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 12.GB * task.attempt, 'memory' ) } diff --git a/docs/usage.md b/docs/usage.md index 81b33493..1a7f91f5 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -12,7 +12,7 @@ You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. -```console +```bash --input '[path to samplesheet file]' ``` @@ -56,7 +56,7 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p The typical command for running the pipeline is as follows: -```console +```bash nextflow run nf-core/mag --input samplesheet.csv --outdir --genome GRCh37 -profile docker ``` @@ -64,9 +64,9 @@ This will launch the pipeline with the `docker` configuration profile. See below Note that the pipeline will create the following files in your working directory: -```console +```bash work # Directory containing the nextflow working files - # Finished results in specified location (defined with --outdir) + # Finished results in specified location (defined with --outdir) .nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` @@ -75,7 +75,7 @@ work # Directory containing the nextflow working files When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: -```console +```bash nextflow pull nf-core/mag ``` @@ -251,6 +251,6 @@ Some HPC setups also allow you to run nextflow within a cluster job submitted yo In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. 
We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`): -```console +```bash NXF_OPTS='-Xms1g -Xmx4g' ``` diff --git a/lib/WorkflowMag.groovy b/lib/WorkflowMag.groovy index 9e3adf54..a0c25181 100755 --- a/lib/WorkflowMag.groovy +++ b/lib/WorkflowMag.groovy @@ -10,6 +10,7 @@ class WorkflowMag { public static void initialise(params, log) { genomeExistsError(params, log) + if (!params.fasta) { log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." System.exit(1) @@ -41,9 +42,7 @@ class WorkflowMag { yaml_file_text += "data: |\n" yaml_file_text += "${summary_section}" return yaml_file_text - } - - // + }// // Exit pipeline if incorrect --genome key provided // private static void genomeExistsError(params, log) { diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 317c61e2..b83b19a5 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -59,6 +59,7 @@ class WorkflowMain { } // Print parameter summary log to screen + log.info paramsSummaryLog(workflow, params, log) // Check that a -profile or Nextflow config has been provided to run the pipeline @@ -78,17 +79,15 @@ class WorkflowMain { System.exit(1) } } - // // Get attribute from genome config file e.g. fasta // - public static String getGenomeAttribute(params, attribute) { - def val = '' + public static Object getGenomeAttribute(params, attribute) { if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) { if (params.genomes[ params.genome ].containsKey(attribute)) { - val = params.genomes[ params.genome ][ attribute ] + return params.genomes[ params.genome ][ attribute ] } } - return val + return null } } diff --git a/main.nf b/main.nf index 1bcbb091..366722cd 100644 --- a/main.nf +++ b/main.nf @@ -4,7 +4,7 @@ nf-core/mag ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Github : https://github.com/nf-core/mag - Website: https://nf-co.re/mag +Website: https://nf-co.re/mag Slack : https://nfcore.slack.com/channels/mag ---------------------------------------------------------------------------------------- */ diff --git a/modules.json b/modules.json index 1768fed6..8154138d 100644 --- a/modules.json +++ b/modules.json @@ -3,14 +3,20 @@ "homePage": "https://github.com/nf-core/mag", "repos": { "nf-core/modules": { - "custom/dumpsoftwareversions": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" - }, - "fastqc": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" - }, - "multiqc": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + "git_url": "https://github.com/nf-core/modules.git", + "modules": { + "custom/dumpsoftwareversions": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d", + "branch": "master" + }, + "fastqc": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d", + "branch": "master" + }, + "multiqc": { + "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d", + "branch": "master" + } } } } diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index d1390392..787bdb7b 100644 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -1,9 +1,10 @@ #!/usr/bin/env python -import yaml import platform from textwrap 
import dedent +import yaml + def _make_versions_html(versions): html = [ @@ -58,11 +59,12 @@ def _make_versions_html(versions): for process, process_versions in versions_by_process.items(): module = process.split(":")[-1] try: - assert versions_by_module[module] == process_versions, ( - "We assume that software versions are the same between all modules. " - "If you see this error-message it means you discovered an edge-case " - "and should open an issue in nf-core/tools. " - ) + if versions_by_module[module] != process_versions: + raise AssertionError( + "We assume that software versions are the same between all modules. " + "If you see this error-message it means you discovered an edge-case " + "and should open an issue in nf-core/tools. " + ) except KeyError: versions_by_module[module] = process_versions diff --git a/nextflow.config b/nextflow.config index ffb73b89..4e61d600 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,11 +13,11 @@ params { // Input options input = null + // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes' igenomes_ignore = false - // MultiQC options multiqc_config = null multiqc_title = null @@ -37,6 +37,7 @@ params { schema_ignore_params = 'genomes' enable_conda = false + // Config options custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" @@ -45,6 +46,7 @@ params { config_profile_url = null config_profile_name = null + // Max resource options // Defaults only, expecting to be overwritten max_memory = '128.GB' @@ -72,6 +74,7 @@ try { // } + profiles { debug { process.beforeScript = 'echo $HOSTNAME' } conda { @@ -82,6 +85,15 @@ profiles { shifter.enabled = false charliecloud.enabled = false } + mamba { + params.enable_conda = true + conda.useMamba = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } docker { docker.enabled = true docker.userEmulation = true @@ -119,10 +131,16 @@ profiles { podman.enabled = false shifter.enabled = false } + gitpod { + executor.name = 'local' + executor.cpus = 16 + executor.memory = 60.GB + } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } } + // Load igenomes.config if required if (!params.igenomes_ignore) { includeConfig 'conf/igenomes.config' @@ -130,6 +148,7 @@ if (!params.igenomes_ignore) { params.genomes = [:] } + // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. 
@@ -169,7 +188,7 @@ manifest { description = 'Assembly, binning and annotation of metagenomes' mainScript = 'main.nf' nextflowVersion = '!>=21.10.3' - version = '2.2.0dev' + version = '2.3.0dev' } // Load modules.config for DSL2 module specific options From 8d4a93f0dde94504e588ebd71813e425b87a2d03 Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Thu, 1 Sep 2022 13:28:21 +0000 Subject: [PATCH 2/4] Template update for nf-core/tools version 2.5.1 --- bin/check_samplesheet.py | 9 ++++++--- pyproject.toml | 10 ++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) create mode 100644 pyproject.toml diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 9a8b8962..11b15572 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -98,7 +98,9 @@ def _validate_pair(self, row): """Assert that read pairs have the same file extension. Report pair status.""" if row[self._first_col] and row[self._second_col]: row[self._single_col] = False - if Path(row[self._first_col]).suffixes[-2:] != Path(row[self._second_col]).suffixes[-2:]: + first_col_suffix = Path(row[self._first_col]).suffixes[-2:] + second_col_suffix = Path(row[self._second_col]).suffixes[-2:] + if first_col_suffix != second_col_suffix: raise AssertionError("FASTQ pairs must have the same file extensions.") else: row[self._single_col] = True @@ -157,7 +159,7 @@ def sniff_format(handle): handle.seek(0) sniffer = csv.Sniffer() if not sniffer.has_header(peek): - logger.critical(f"The given sample sheet does not appear to contain a header.") + logger.critical("The given sample sheet does not appear to contain a header.") sys.exit(1) dialect = sniffer.sniff(peek) return dialect @@ -195,7 +197,8 @@ def check_samplesheet(file_in, file_out): reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) # Validate the existence of the expected header columns. if not required_columns.issubset(reader.fieldnames): - logger.critical(f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}.") + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") sys.exit(1) # Validate each row. checker = RowChecker() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..0d62beb6 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,10 @@ +# Config file for Python. Mostly used to configure linting of bin/check_samplesheet.py with Black. +# Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. 
+[tool.black] +line-length = 120 +target_version = ["py37", "py38", "py39", "py310"] + +[tool.isort] +profile = "black" +known_first_party = ["nf_core"] +multi_line_output = 3 From 2fd132ce1d62990ceea59e71fb1f4de458f458cc Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Tue, 4 Oct 2022 21:53:31 +0000 Subject: [PATCH 3/4] Template update for nf-core/tools version 2.6 --- .github/workflows/awsfulltest.yml | 4 ++ .github/workflows/awstest.yml | 4 ++ .prettierignore | 1 + CITATION.cff | 8 +-- assets/adaptivecard.json | 67 +++++++++++++++++++ assets/methods_description_template.yml | 25 +++++++ assets/multiqc_config.yml | 6 +- docs/usage.md | 8 +++ lib/NfcoreTemplate.groovy | 55 +++++++++++++++ lib/Utils.groovy | 21 ++++-- lib/WorkflowMag.groovy | 19 ++++++ main.nf | 3 +- modules.json | 27 ++++---- .../custom/dumpsoftwareversions/main.nf | 8 +-- .../custom/dumpsoftwareversions/meta.yml | 0 .../templates/dumpsoftwareversions.py | 0 modules/nf-core/{modules => }/fastqc/main.nf | 12 ++++ modules/nf-core/{modules => }/fastqc/meta.yml | 0 modules/nf-core/modules/multiqc/main.nf | 31 --------- modules/nf-core/multiqc/main.nf | 53 +++++++++++++++ .../nf-core/{modules => }/multiqc/meta.yml | 15 +++++ nextflow.config | 5 +- nextflow_schema.json | 18 +++++ workflows/mag.nf | 26 ++++--- 24 files changed, 345 insertions(+), 71 deletions(-) create mode 100644 assets/adaptivecard.json create mode 100644 assets/methods_description_template.yml mode change 100755 => 100644 lib/Utils.groovy rename modules/nf-core/{modules => }/custom/dumpsoftwareversions/main.nf (79%) rename modules/nf-core/{modules => }/custom/dumpsoftwareversions/meta.yml (100%) rename modules/nf-core/{modules => }/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py (100%) rename modules/nf-core/{modules => }/fastqc/main.nf (85%) rename modules/nf-core/{modules => }/fastqc/meta.yml (100%) delete mode 100644 modules/nf-core/modules/multiqc/main.nf create mode 100644 modules/nf-core/multiqc/main.nf rename modules/nf-core/{modules => }/multiqc/meta.yml (73%) diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index e7365baf..8cc24aae 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -28,3 +28,7 @@ jobs: "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/mag/results-${{ github.sha }}" } profiles: test_full,aws_tower + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index e5f69c5d..7a6ff7ef 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -23,3 +23,7 @@ jobs: "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/mag/results-test-${{ github.sha }}" } profiles: test,aws_tower + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log diff --git a/.prettierignore b/.prettierignore index d0e7ae58..eb74a574 100644 --- a/.prettierignore +++ b/.prettierignore @@ -1,4 +1,5 @@ email_template.html +adaptivecard.json .nextflow* work/ data/ diff --git a/CITATION.cff b/CITATION.cff index 4533e2f2..017666c0 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -13,8 +13,8 @@ authors: given-names: Johannes - family-names: Wilm given-names: Andreas - - family-names: Ulysse Garcia - given-names: Maxime + - family-names: Garcia + given-names: Maxime Ulysse - family-names: Di Tommaso given-names: Paolo - family-names: Nahnsen @@ -39,8 +39,8 @@ prefered-citation: given-names: Johannes - family-names: Wilm 
given-names: Andreas - - family-names: Ulysse Garcia - given-names: Maxime + - family-names: Garcia + given-names: Maxime Ulysse - family-names: Di Tommaso given-names: Paolo - family-names: Nahnsen diff --git a/assets/adaptivecard.json b/assets/adaptivecard.json new file mode 100644 index 00000000..30bab078 --- /dev/null +++ b/assets/adaptivecard.json @@ -0,0 +1,67 @@ +{ + "type": "message", + "attachments": [ + { + "contentType": "application/vnd.microsoft.card.adaptive", + "contentUrl": null, + "content": { + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "msteams": { + "width": "Full" + }, + "type": "AdaptiveCard", + "version": "1.2", + "body": [ + { + "type": "TextBlock", + "size": "Large", + "weight": "Bolder", + "color": "<% if (success) { %>Good<% } else { %>Attention<%} %>", + "text": "nf-core/mag v${version} - ${runName}", + "wrap": true + }, + { + "type": "TextBlock", + "spacing": "None", + "text": "Completed at ${dateComplete} (duration: ${duration})", + "isSubtle": true, + "wrap": true + }, + { + "type": "TextBlock", + "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors. The full error message was: ${errorReport}.<% } %>", + "wrap": true + }, + { + "type": "TextBlock", + "text": "The command used to launch the workflow was as follows:", + "wrap": true + }, + { + "type": "TextBlock", + "text": "${commandLine}", + "isSubtle": true, + "wrap": true + } + ], + "actions": [ + { + "type": "Action.ShowCard", + "title": "Pipeline Configuration", + "card": { + "type": "AdaptiveCard", + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "body": [ + { + "type": "FactSet", + "facts": [<% out << summary.collect{ k,v -> "{\"title\": \"$k\", \"value\" : \"$v\"}"}.join(",\n") %> + ] + } + ] + } + } + ] + } + } + ] +} diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml new file mode 100644 index 00000000..daf6c450 --- /dev/null +++ b/assets/methods_description_template.yml @@ -0,0 +1,25 @@ +id: "nf-core-mag-methods-description" +description: "Suggested text and references to use when describing pipeline usage within the methods section of a publication." +section_name: "nf-core/mag Methods Description" +section_href: "https://github.com/nf-core/mag" +plot_type: "html" +## TODO nf-core: Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline +## You inject any metadata in the Nextflow '${workflow}' object +data: | +

+  <h4>Methods</h4>
+  <p>Data was processed using nf-core/mag v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (<a href="https://doi.org/10.1038/s41587-020-0439-x">Ewels <em>et al.</em>, 2020</a>).</p>
+  <p>The pipeline was executed with Nextflow v${workflow.nextflow.version} (<a href="https://doi.org/10.1038/nbt.3820">Di Tommaso <em>et al.</em>, 2017</a>) with the following command:</p>
+  <pre><code>${workflow.commandLine}</code></pre>
+  <h4>References</h4>
+  <ul>
+    <li>Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. <a href="https://doi.org/10.1038/nbt.3820">https://doi.org/10.1038/nbt.3820</a></li>
+    <li>Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. <a href="https://doi.org/10.1038/s41587-020-0439-x">https://doi.org/10.1038/s41587-020-0439-x</a></li>
+  </ul>
+  <div class="alert alert-info">
+    <h5>Notes:</h5>
+    <ul>
+      ${nodoi_text}
+      <li>The command above does not include parameters contained in any configs or profiles that may have been used. Ensure the config file is also uploaded with your publication!</li>
+      <li>You should also cite all software used within this run. Check the "Software Versions" of this report to get version information.</li>
+    </ul>
+  </div>
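[Editor's illustrative sketch, not part of the patch: the `data:` block above is rendered with the workflow metadata and injected into the MultiQC report; later hunks in this patch also add a `--multiqc_methods_description` parameter so a user can substitute their own YAML with the same structure. A hypothetical invocation might look like the following, where `my_methods_description.yml` is a hypothetical file following the template above.]

```bash
# Sketch: supply a custom methods-description YAML to MultiQC
nextflow run nf-core/mag \
    -profile docker \
    --input samplesheet.csv \
    --outdir ./results \
    --multiqc_methods_description my_methods_description.yml
```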
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index db2c548c..fcfdb30c 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -3,9 +3,11 @@ report_comment: > analysis pipeline. For information about how to interpret these results, please see the documentation. report_section_order: - software_versions: + "nf-core-mag-methods-description": order: -1000 - "nf-core-mag-summary": + software_versions: order: -1001 + "nf-core-mag-summary": + order: -1002 export_plots: true diff --git a/docs/usage.md b/docs/usage.md index 1a7f91f5..f0ae31a3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -237,6 +237,14 @@ See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs). +## Azure Resource Requests + +To be used with the `azurebatch` profile by specifying the `-profile azurebatch`. +We recommend providing a compute `params.vm_type` of `Standard_D16_v3` VMs by default but these options can be changed if required. + +Note that the choice of VM size depends on your quota and the overall workload during the analysis. +For a thorough list, please refer the [Azure Sizes for virtual machines in Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes). + ## Running in the background Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 2fc0a9b9..27feb009 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -145,6 +145,61 @@ class NfcoreTemplate { output_tf.withWriter { w -> w << email_txt } } + // + // Construct and send adaptive card + // https://adaptivecards.io + // + public static void adaptivecard(workflow, params, summary_params, projectDir, log) { + def hook_url = params.hook_url + + def summary = [:] + for (group in summary_params.keySet()) { + summary << summary_params[group] + } + + def misc_fields = [:] + misc_fields['start'] = workflow.start + misc_fields['complete'] = workflow.complete + misc_fields['scriptfile'] = workflow.scriptFile + misc_fields['scriptid'] = workflow.scriptId + if (workflow.repository) misc_fields['repository'] = workflow.repository + if (workflow.commitId) misc_fields['commitid'] = workflow.commitId + if (workflow.revision) misc_fields['revision'] = workflow.revision + misc_fields['nxf_version'] = workflow.nextflow.version + misc_fields['nxf_build'] = workflow.nextflow.build + misc_fields['nxf_timestamp'] = workflow.nextflow.timestamp + + def msg_fields = [:] + msg_fields['version'] = workflow.manifest.version + msg_fields['runName'] = workflow.runName + msg_fields['success'] = workflow.success + msg_fields['dateComplete'] = workflow.complete + msg_fields['duration'] = workflow.duration + msg_fields['exitStatus'] = workflow.exitStatus + msg_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + msg_fields['errorReport'] = (workflow.errorReport ?: 'None') + msg_fields['commandLine'] = workflow.commandLine + msg_fields['projectDir'] = workflow.projectDir + msg_fields['summary'] = summary << misc_fields + + // Render the JSON template + def engine = new groovy.text.GStringTemplateEngine() + def hf = new File("$projectDir/assets/adaptivecard.json") + def json_template = engine.createTemplate(hf).make(msg_fields) + def json_message = json_template.toString() + + // 
POST + def post = new URL(hook_url).openConnection(); + post.setRequestMethod("POST") + post.setDoOutput(true) + post.setRequestProperty("Content-Type", "application/json") + post.getOutputStream().write(json_message.getBytes("UTF-8")); + def postRC = post.getResponseCode(); + if (! postRC.equals(200)) { + log.warn(post.getErrorStream().getText()); + } + } + // // Print pipeline summary on completion // diff --git a/lib/Utils.groovy b/lib/Utils.groovy old mode 100755 new mode 100644 index 28567bd7..8d030f4e --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -21,19 +21,26 @@ class Utils { } // Check that all channels are present - def required_channels = ['conda-forge', 'bioconda', 'defaults'] - def conda_check_failed = !required_channels.every { ch -> ch in channels } + // This channel list is ordered by required channel priority. + def required_channels_in_order = ['conda-forge', 'bioconda', 'defaults'] + def channels_missing = ((required_channels_in_order as Set) - (channels as Set)) as Boolean // Check that they are in the right order - conda_check_failed |= !(channels.indexOf('conda-forge') < channels.indexOf('bioconda')) - conda_check_failed |= !(channels.indexOf('bioconda') < channels.indexOf('defaults')) + def channel_priority_violation = false + def n = required_channels_in_order.size() + for (int i = 0; i < n - 1; i++) { + channel_priority_violation |= !(channels.indexOf(required_channels_in_order[i]) < channels.indexOf(required_channels_in_order[i+1])) + } - if (conda_check_failed) { + if (channels_missing | channel_priority_violation) { log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " There is a problem with your Conda configuration!\n\n" + " You will need to set-up the conda-forge and bioconda channels correctly.\n" + - " Please refer to https://bioconda.github.io/user/install.html#set-up-channels\n" + - " NB: The order of the channels matters!\n" + + " Please refer to https://bioconda.github.io/\n" + + " The observed channel order is \n" + + " ${channels}\n" + + " but the following channel order is required:\n" + + " ${required_channels_in_order}\n" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } } diff --git a/lib/WorkflowMag.groovy b/lib/WorkflowMag.groovy index a0c25181..c0b35db2 100755 --- a/lib/WorkflowMag.groovy +++ b/lib/WorkflowMag.groovy @@ -2,6 +2,8 @@ // This file holds several functions specific to the workflow/mag.nf in the nf-core/mag pipeline // +import groovy.text.SimpleTemplateEngine + class WorkflowMag { // @@ -42,6 +44,23 @@ class WorkflowMag { yaml_file_text += "data: |\n" yaml_file_text += "${summary_section}" return yaml_file_text + } + + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { + // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file + def meta = [:] + meta.workflow = run_workflow.toMap() + meta["manifest_map"] = run_workflow.manifest.toMap() + + meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" + meta["nodoi_text"] = meta.manifest_map.doi ? "": "
<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used. </li>
  • " + + def methods_text = mqc_methods_yaml.text + + def engine = new SimpleTemplateEngine() + def description_html = engine.createTemplate(methods_text).make(meta) + + return description_html }// // Exit pipeline if incorrect --genome key provided // diff --git a/main.nf b/main.nf index 366722cd..567cbdc1 100644 --- a/main.nf +++ b/main.nf @@ -4,7 +4,8 @@ nf-core/mag ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Github : https://github.com/nf-core/mag -Website: https://nf-co.re/mag + + Website: https://nf-co.re/mag Slack : https://nfcore.slack.com/channels/mag ---------------------------------------------------------------------------------------- */ diff --git a/modules.json b/modules.json index 8154138d..3b4687ce 100644 --- a/modules.json +++ b/modules.json @@ -2,20 +2,21 @@ "name": "nf-core/mag", "homePage": "https://github.com/nf-core/mag", "repos": { - "nf-core/modules": { - "git_url": "https://github.com/nf-core/modules.git", + "https://github.com/nf-core/modules.git": { "modules": { - "custom/dumpsoftwareversions": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d", - "branch": "master" - }, - "fastqc": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d", - "branch": "master" - }, - "multiqc": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d", - "branch": "master" + "nf-core": { + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "fastqc": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "multiqc": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + } } } } diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf similarity index 79% rename from modules/nf-core/modules/custom/dumpsoftwareversions/main.nf rename to modules/nf-core/custom/dumpsoftwareversions/main.nf index 327d5100..cebb6e05 100644 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -1,11 +1,11 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { - label 'process_low' + label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda (params.enable_conda ? "bioconda::multiqc=1.11" : null) + conda (params.enable_conda ? 'bioconda::multiqc=1.13' : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml similarity index 100% rename from modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml rename to modules/nf-core/custom/dumpsoftwareversions/meta.yml diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py similarity index 100% rename from modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py rename to modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py diff --git a/modules/nf-core/modules/fastqc/main.nf b/modules/nf-core/fastqc/main.nf similarity index 85% rename from modules/nf-core/modules/fastqc/main.nf rename to modules/nf-core/fastqc/main.nf index ed6b8c50..05730368 100644 --- a/modules/nf-core/modules/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -44,4 +44,16 @@ process FASTQC { END_VERSIONS """ } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.html + touch ${prefix}.zip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + END_VERSIONS + """ } diff --git a/modules/nf-core/modules/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml similarity index 100% rename from modules/nf-core/modules/fastqc/meta.yml rename to modules/nf-core/fastqc/meta.yml diff --git a/modules/nf-core/modules/multiqc/main.nf b/modules/nf-core/modules/multiqc/main.nf deleted file mode 100644 index 1264aac1..00000000 --- a/modules/nf-core/modules/multiqc/main.nf +++ /dev/null @@ -1,31 +0,0 @@ -process MULTIQC { - label 'process_medium' - - conda (params.enable_conda ? 'bioconda::multiqc=1.12' : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }" - - input: - path multiqc_files - - output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - """ - multiqc -f $args . - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf new file mode 100644 index 00000000..a8159a57 --- /dev/null +++ b/modules/nf-core/multiqc/main.nf @@ -0,0 +1,53 @@ +process MULTIQC { + label 'process_single' + + conda (params.enable_conda ? 'bioconda::multiqc=1.13' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + + input: + path multiqc_files, stageAs: "?/*" + path(multiqc_config) + path(extra_multiqc_config) + path(multiqc_logo) + + output: + path "*multiqc_report.html", emit: report + path "*_data" , emit: data + path "*_plots" , optional:true, emit: plots + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def config = multiqc_config ? "--config $multiqc_config" : '' + def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' + """ + multiqc \\ + --force \\ + $args \\ + $config \\ + $extra_config \\ + . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ + + stub: + """ + touch multiqc_data + touch multiqc_plots + touch multiqc_report.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml similarity index 73% rename from modules/nf-core/modules/multiqc/meta.yml rename to modules/nf-core/multiqc/meta.yml index 6fa891ef..ebc29b27 100644 --- a/modules/nf-core/modules/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -12,11 +12,25 @@ tools: homepage: https://multiqc.info/ documentation: https://multiqc.info/docs/ licence: ["GPL-3.0-or-later"] + input: - multiqc_files: type: file description: | List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + - multiqc_config: + type: file + description: Optional config yml for MultiQC + pattern: "*.{yml,yaml}" + - extra_multiqc_config: + type: file + description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. + pattern: "*.{yml,yaml}" + - multiqc_logo: + type: file + description: Optional logo file for MultiQC + pattern: "*.{png}" + output: - report: type: file @@ -38,3 +52,4 @@ authors: - "@abhi18av" - "@bunop" - "@drpatelh" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 4e61d600..1f540181 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,7 +21,9 @@ params { // MultiQC options multiqc_config = null multiqc_title = null + multiqc_logo = null max_multiqc_email_size = '25.MB' + multiqc_methods_description = null // Boilerplate options outdir = null @@ -31,6 +33,7 @@ params { email_on_fail = null plaintext_email = false monochrome_logs = false + hook_url = null help = false validate_params = true show_hidden_params = false @@ -74,7 +77,6 @@ try { // } - profiles { debug { process.beforeScript = 'echo $HOSTNAME' } conda { @@ -189,6 +191,7 @@ manifest { mainScript = 'main.nf' nextflowVersion = '!>=21.10.3' version = '2.3.0dev' + doi = '' } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index 2721a11e..e53cd1d6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -213,12 +213,30 @@ "fa_icon": "fas fa-palette", "hidden": true }, + "hook_url": { + "type": "string", + "description": "Incoming hook URL for messaging service", + "fa_icon": "fas fa-people-group", + "help_text": "Incoming hook URL for messaging service. 
Currently, only MS Teams is supported.", + "hidden": true + }, "multiqc_config": { "type": "string", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true }, + "multiqc_logo": { + "type": "string", + "description": "Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file", + "fa_icon": "fas fa-image", + "hidden": true + }, + "multiqc_methods_description": { + "type": "string", + "description": "Custom MultiQC yaml file containing HTML including a methods description.", + "fa_icon": "fas fa-cog" + }, "tracedir": { "type": "string", "description": "Directory to keep pipeline Nextflow logs and reports.", diff --git a/workflows/mag.nf b/workflows/mag.nf index 9a755509..5acb38c5 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -23,8 +23,10 @@ if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input sample ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty() +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -46,9 +48,9 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/modules/fastqc/main' -include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' +include { FASTQC } from '../modules/nf-core/fastqc/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -89,15 +91,20 @@ workflow MAG { workflow_summary = WorkflowMag.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) + methods_description = WorkflowMag.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) + ch_methods_description = Channel.value(methods_description) + ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) - ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) MULTIQC ( - ch_multiqc_files.collect() + 
ch_multiqc_files.collect(), + ch_multiqc_config.collect().ifEmpty([]), + ch_multiqc_custom_config.collect().ifEmpty([]), + ch_multiqc_logo.collect().ifEmpty([]) ) multiqc_report = MULTIQC.out.report.toList() ch_versions = ch_versions.mix(MULTIQC.out.versions) @@ -114,6 +121,9 @@ workflow.onComplete { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) } NfcoreTemplate.summary(workflow, params, log) + if (params.hook_url) { + NfcoreTemplate.adaptivecard(workflow, params, summary_params, projectDir, log) + } } /* From 8fbded0ec55db5fc818f458fdb1f4c4a0727f9c3 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Thu, 6 Oct 2022 10:56:12 +0200 Subject: [PATCH 4/4] Blackify --- bin/combine_tables.py | 46 ++++++--- bin/filter_ssu.py | 23 +---- bin/get_mag_depths.py | 54 ++++++---- bin/get_mag_depths_summary.py | 24 ++++- bin/multiqc_to_custom_tsv.py | 108 +++++++++++++++----- bin/plot_mag_depths.py | 50 ++++++--- bin/split_fasta.py | 58 ++++++----- bin/summary_busco.py | 163 +++++++++++++++++++---------- bin/summary_gtdbtk.py | 187 +++++++++++++++++++++++++--------- 9 files changed, 486 insertions(+), 227 deletions(-) diff --git a/bin/combine_tables.py b/bin/combine_tables.py index fe1e6291..58f7683e 100755 --- a/bin/combine_tables.py +++ b/bin/combine_tables.py @@ -5,14 +5,22 @@ import os.path import pandas as pd + def parse_args(args=None): parser = argparse.ArgumentParser() - parser.add_argument('-d', "--depths_summary", required=True, metavar='FILE', help="Bin depths summary file.") - parser.add_argument('-b', "--busco_summary", metavar='FILE', help="BUSCO summary file.") - parser.add_argument('-q', "--quast_summary", metavar='FILE', help="QUAST BINS summary file.") - parser.add_argument('-g', "--gtdbtk_summary", metavar='FILE', help="GTDB-Tk summary file.") + parser.add_argument("-d", "--depths_summary", required=True, metavar="FILE", help="Bin depths summary file.") + parser.add_argument("-b", "--busco_summary", metavar="FILE", help="BUSCO summary file.") + parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.") + parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.") - parser.add_argument('-o', "--out", required=True, metavar='FILE', type=argparse.FileType('w'), help="Output file containing final summary.") + parser.add_argument( + "-o", + "--out", + required=True, + metavar="FILE", + type=argparse.FileType("w"), + help="Output file containing final summary.", + ) return parser.parse_args(args) @@ -28,28 +36,36 @@ def main(args=None): # handle bin depths results = pd.read_csv(args.depths_summary, sep="\t") - results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns ] - bins = results['bin'].sort_values().reset_index(drop=True) + results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns] + bins = results["bin"].sort_values().reset_index(drop=True) if args.busco_summary: busco_results = pd.read_csv(args.busco_summary, sep="\t") - if not bins.equals(busco_results['GenomeBin'].sort_values().reset_index(drop=True)): + if not bins.equals(busco_results["GenomeBin"].sort_values().reset_index(drop=True)): sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!") - results = pd.merge(results, busco_results, left_on="bin", right_on="GenomeBin", how='outer') # assuming depths for all bins are given + results = pd.merge( + results, busco_results, left_on="bin", right_on="GenomeBin", how="outer" + ) # 
assuming depths for all bins are given if args.quast_summary: quast_results = pd.read_csv(args.quast_summary, sep="\t") - if not bins.equals(quast_results['Assembly'].sort_values().reset_index(drop=True)): + if not bins.equals(quast_results["Assembly"].sort_values().reset_index(drop=True)): sys.exit("Bins in QUAST summary do not match bins in bin depths summary!") - results = pd.merge(results, quast_results, left_on="bin", right_on="Assembly", how='outer') # assuming depths for all bins are given + results = pd.merge( + results, quast_results, left_on="bin", right_on="Assembly", how="outer" + ) # assuming depths for all bins are given if args.gtdbtk_summary: gtdbtk_results = pd.read_csv(args.gtdbtk_summary, sep="\t") - if not bins.equals(gtdbtk_results['user_genome'].sort_values().reset_index(drop=True)): - sys.exit("Bins in GTDB-Tk summary do not match bins in BUSCO summary!") # GTDB-Tk can currently anyway only run in combination with BUSCO - results = pd.merge(results, gtdbtk_results, left_on="GenomeBin", right_on="user_genome", how='outer') # assuming BUSCO summary must be given + if not bins.equals(gtdbtk_results["user_genome"].sort_values().reset_index(drop=True)): + sys.exit( + "Bins in GTDB-Tk summary do not match bins in BUSCO summary!" + ) # GTDB-Tk can currently anyway only run in combination with BUSCO + results = pd.merge( + results, gtdbtk_results, left_on="GenomeBin", right_on="user_genome", how="outer" + ) # assuming BUSCO summary must be given - results.to_csv(args.out, sep='\t') + results.to_csv(args.out, sep="\t") if __name__ == "__main__": diff --git a/bin/filter_ssu.py b/bin/filter_ssu.py index 665f51a2..7e89989b 100755 --- a/bin/filter_ssu.py +++ b/bin/filter_ssu.py @@ -28,25 +28,10 @@ def filter(args): def main(): - parser = argparse.ArgumentParser( - prog="filter_ssu.py", - usage="filter ssu hits from refinem" - ) - parser.add_argument( - "--evalue", - help="evalue threshold" - ) - parser.add_argument( - "ssu", - metavar="ssu.tsv", - help="ssu tsv file generated by refinem" - ) - parser.add_argument( - "output", - metavar="output.tsv", - default="output.tsv", - help="output file name" - ) + parser = argparse.ArgumentParser(prog="filter_ssu.py", usage="filter ssu hits from refinem") + parser.add_argument("--evalue", help="evalue threshold") + parser.add_argument("ssu", metavar="ssu.tsv", help="ssu tsv file generated by refinem") + parser.add_argument("output", metavar="output.tsv", default="output.tsv", help="output file name") parser.set_defaults(func=filter) args = parser.parse_args() diff --git a/bin/get_mag_depths.py b/bin/get_mag_depths.py index 64418068..55d73ac4 100755 --- a/bin/get_mag_depths.py +++ b/bin/get_mag_depths.py @@ -13,14 +13,25 @@ def parse_args(args=None): parser = argparse.ArgumentParser() - parser.add_argument('-b', '--bins' , required=True, nargs="+", metavar='FILE' , help="Bins: FASTA containing all contigs.") - parser.add_argument('-d', '--depths' , required=True , metavar='FILE' , help="(Compressed) TSV file containing contig depths for each sample: contigName, contigLen, totalAvgDepth, sample1_avgDepth, sample1_var [, sample2_avgDepth, sample2_var, ...].") - parser.add_argument('-a', '--assembler' , required=True , type=str , help="Assembler name.") - parser.add_argument('-i', '--id' , required=True , type=str , help="Sample or group id.") - parser.add_argument('-m', '--binner' , required=True , type=str , help="Binning method.") + parser.add_argument( + "-b", "--bins", required=True, nargs="+", metavar="FILE", help="Bins: FASTA containing 
all contigs." + ) + parser.add_argument( + "-d", + "--depths", + required=True, + metavar="FILE", + help="(Compressed) TSV file containing contig depths for each sample: contigName, contigLen, totalAvgDepth, sample1_avgDepth, sample1_var [, sample2_avgDepth, sample2_var, ...].", + ) + parser.add_argument("-a", "--assembler", required=True, type=str, help="Assembler name.") + parser.add_argument("-i", "--id", required=True, type=str, help="Sample or group id.") + parser.add_argument("-m", "--binner", required=True, type=str, help="Binning method.") return parser.parse_args(args) + + # Processing contig depths for each binner again, i.e. not the most efficient way, but ok + def main(args=None): args = parse_args(args) @@ -28,46 +39,51 @@ def main(args=None): sample_names = [] dict_contig_depths = {} with gzip.open(args.depths, "rt") as infile: - reader = csv.reader(infile, delimiter = "\t") + reader = csv.reader(infile, delimiter="\t") # process header header = next(reader) - for sample in range(int((len(header)-3)/2)): - col_name = header[3+2*sample] + for sample in range(int((len(header) - 3) / 2)): + col_name = header[3 + 2 * sample] # retrieve sample name: "--.bam" - sample_name = col_name[len(args.assembler)+1+len(args.id)+1:-4] + sample_name = col_name[len(args.assembler) + 1 + len(args.id) + 1 : -4] sample_names.append(sample_name) # process contig depths for row in reader: contig_depths = [] - for sample in range(int((len(row)-3)/2)): - contig_depths.append(float(row[3+2*sample])) + for sample in range(int((len(row) - 3) / 2)): + contig_depths.append(float(row[3 + 2 * sample])) dict_contig_depths[str(row[0])] = contig_depths # Initialize output files n_samples = len(sample_names) - with open(args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", 'w') as outfile: - print("bin", '\t'.join(sample_names), sep='\t', file=outfile) + with open(args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "w") as outfile: + print("bin", "\t".join(sample_names), sep="\t", file=outfile) # for each bin, access contig depths and compute mean bin depth (for all samples) for file in args.bins: all_depths = [[] for i in range(n_samples)] - if file.endswith('.gz'): - with gzip.open(file, 'rt') as infile: - for rec in SeqIO.parse(infile,'fasta'): + if file.endswith(".gz"): + with gzip.open(file, "rt") as infile: + for rec in SeqIO.parse(infile, "fasta"): contig_depths = dict_contig_depths[rec.id] for sample in range(n_samples): all_depths[sample].append(contig_depths[sample]) else: with open(file, "rt") as infile: - for rec in SeqIO.parse(infile,'fasta'): + for rec in SeqIO.parse(infile, "fasta"): contig_depths = dict_contig_depths[rec.id] for sample in range(n_samples): all_depths[sample].append(contig_depths[sample]) binname = os.path.basename(file) - with open(args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", 'a') as outfile: - print(binname, '\t'.join(str(statistics.median(sample_depths)) for sample_depths in all_depths), sep='\t', file=outfile) + with open(args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "a") as outfile: + print( + binname, + "\t".join(str(statistics.median(sample_depths)) for sample_depths in all_depths), + sep="\t", + file=outfile, + ) if __name__ == "__main__": diff --git a/bin/get_mag_depths_summary.py b/bin/get_mag_depths_summary.py index e70e640e..6dbc6f75 100755 --- a/bin/get_mag_depths_summary.py +++ b/bin/get_mag_depths_summary.py @@ -7,19 +7,35 @@ def parse_args(args=None): parser = 
argparse.ArgumentParser() - parser.add_argument('-d', '--depths' , required=True, nargs="+", metavar='FILE' , help="TSV file for each assembly and binning method containing bin depths for samples: bin, sample1, ....") - parser.add_argument('-o', "--out" , required=True , metavar='FILE', type=argparse.FileType('w'), help="Output file containing depths for all assemblies, binning methods and all samples.") + parser.add_argument( + "-d", + "--depths", + required=True, + nargs="+", + metavar="FILE", + help="TSV file for each assembly and binning method containing bin depths for samples: bin, sample1, ....", + ) + parser.add_argument( + "-o", + "--out", + required=True, + metavar="FILE", + type=argparse.FileType("w"), + help="Output file containing depths for all assemblies, binning methods and all samples.", + ) return parser.parse_args(args) + def main(args=None): args = parse_args(args) results = pd.DataFrame() for assembly_depths_file in args.depths: assembly_results = pd.read_csv(assembly_depths_file, index_col="bin", sep="\t") - results = results.append(assembly_results, sort=True, verify_integrity=True) + results = results.append(assembly_results, sort=True, verify_integrity=True) + + results.to_csv(args.out, sep="\t") - results.to_csv(args.out, sep='\t') if __name__ == "__main__": sys.exit(main()) diff --git a/bin/multiqc_to_custom_tsv.py b/bin/multiqc_to_custom_tsv.py index 544ca0f4..6488e31d 100755 --- a/bin/multiqc_to_custom_tsv.py +++ b/bin/multiqc_to_custom_tsv.py @@ -9,11 +9,26 @@ def parse_args(args=None): - Description = 'Create custom spreadsheet for pertinent MultiQC bowtie 2 metrics generated by the nf-core/mag pipeline.' + Description = ( + "Create custom spreadsheet for pertinent MultiQC bowtie 2 metrics generated by the nf-core/mag pipeline." + ) Epilog = "Example usage: python multiqc_to_custom_tsv.py" parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument('-md', '--multiqc_data_dir', type=str, dest="MULTIQC_DATA_DIR", default='multiqc_data', help="Full path to directory containing YAML files for each module, as generated by MultiQC. (default: 'multiqc_data').") - parser.add_argument('-se', '--single_end', dest="SINGLE_END", action='store_true', help="Specifies that the input is single-end reads.") + parser.add_argument( + "-md", + "--multiqc_data_dir", + type=str, + dest="MULTIQC_DATA_DIR", + default="multiqc_data", + help="Full path to directory containing YAML files for each module, as generated by MultiQC. 
(default: 'multiqc_data').", + ) + parser.add_argument( + "-se", + "--single_end", + dest="SINGLE_END", + action="store_true", + help="Specifies that the input is single-end reads.", + ) return parser.parse_args(args) @@ -31,13 +46,13 @@ def make_dir(path): def find_tag(d, tag): if tag in d: yield d[tag] - for k,v in d.items(): + for k, v in d.items(): if isinstance(v, dict): for i in find_tag(v, tag): yield i -def yaml_fields_to_dict(YAMLFile,AppendDict={},FieldMappingList=[]): +def yaml_fields_to_dict(YAMLFile, AppendDict={}, FieldMappingList=[]): with open(YAMLFile) as f: yaml_dict = yaml.safe_load(f) for k in yaml_dict.keys(): @@ -45,7 +60,7 @@ def yaml_fields_to_dict(YAMLFile,AppendDict={},FieldMappingList=[]): if key not in AppendDict: AppendDict[key] = {} if FieldMappingList != []: - for i,j in FieldMappingList: + for i, j in FieldMappingList: val = list(find_tag(yaml_dict[k], j[0])) if len(val) != 0: val = val[0] @@ -54,33 +69,60 @@ def yaml_fields_to_dict(YAMLFile,AppendDict={},FieldMappingList=[]): if i not in AppendDict[key]: AppendDict[key][i] = val else: - print('WARNING: {} key already exists in dictionary so will be overwritten. YAML file {}.'.format(i,YAMLFile)) + print( + "WARNING: {} key already exists in dictionary so will be overwritten. YAML file {}.".format( + i, YAMLFile + ) + ) else: AppendDict[key] = yaml_dict[k] return AppendDict + # customized -def metrics_dict_to_file(FileFieldList,MultiQCDataDir,OutFile,se): +def metrics_dict_to_file(FileFieldList, MultiQCDataDir, OutFile, se): MetricsDict = {} FieldList = [] - for yamlFile,mappingList in FileFieldList: - yamlFile = os.path.join(MultiQCDataDir,yamlFile) + for yamlFile, mappingList in FileFieldList: + yamlFile = os.path.join(MultiQCDataDir, yamlFile) if os.path.exists(yamlFile): - MetricsDict = yaml_fields_to_dict(YAMLFile=yamlFile,AppendDict=MetricsDict,FieldMappingList=mappingList) + MetricsDict = yaml_fields_to_dict(YAMLFile=yamlFile, AppendDict=MetricsDict, FieldMappingList=mappingList) FieldList += [x[0] for x in mappingList] else: - print('WARNING: File does not exist: {}'.format(yamlFile)) + print("WARNING: File does not exist: {}".format(yamlFile)) if MetricsDict != {}: make_dir(os.path.dirname(OutFile)) - with open(OutFile,'w') as fout: + with open(OutFile, "w") as fout: if se: - fout.write('{}\n'.format('\t'.join(['Sample', 'SE reads not mapped (kept)', 'SE reads mapped (discarded)']))) + fout.write( + "{}\n".format("\t".join(["Sample", "SE reads not mapped (kept)", "SE reads mapped (discarded)"])) + ) else: - fout.write('{}\n'.format('\t'.join(['Sample', 'PE reads not mapped concordantly (kept)', 'PE reads mapped concordantly (discarded)']))) + fout.write( + "{}\n".format( + "\t".join( + [ + "Sample", + "PE reads not mapped concordantly (kept)", + "PE reads mapped concordantly (discarded)", + ] + ) + ) + ) for k in sorted(MetricsDict.keys()): # write out # not mapped reads and # mapped reads (uniquely + multi mapping reads) - fout.write('{}\n'.format('\t'.join([k, str(MetricsDict[k][FieldList[0]]), str(MetricsDict[k][FieldList[1]] + MetricsDict[k][FieldList[2]])]))) + fout.write( + "{}\n".format( + "\t".join( + [ + k, + str(MetricsDict[k][FieldList[0]]), + str(MetricsDict[k][FieldList[1]] + MetricsDict[k][FieldList[2]]), + ] + ) + ) + ) return MetricsDict @@ -91,23 +133,35 @@ def main(args=None): Bowtie2FileFieldList = [] if args.SINGLE_END: Bowtie2FileFieldList = [ - ('multiqc_bowtie2.yaml', [('# Not mapped reads', ['unpaired_aligned_none']), - ('# Mapped reads 1', ['unpaired_aligned_one']), - ('# 
Mapped reads multi', ['unpaired_aligned_multi'])]), + ( + "multiqc_bowtie2.yaml", + [ + ("# Not mapped reads", ["unpaired_aligned_none"]), + ("# Mapped reads 1", ["unpaired_aligned_one"]), + ("# Mapped reads multi", ["unpaired_aligned_multi"]), + ], + ), ] else: Bowtie2FileFieldList = [ - ('multiqc_bowtie2.yaml', [('# Not mapped reads', ['paired_aligned_none']), - ('# Mapped reads 1', ['paired_aligned_one']), - ('# Mapped reads multi', ['paired_aligned_multi'])]), + ( + "multiqc_bowtie2.yaml", + [ + ("# Not mapped reads", ["paired_aligned_none"]), + ("# Mapped reads 1", ["paired_aligned_one"]), + ("# Mapped reads multi", ["paired_aligned_multi"]), + ], + ), ] ## Write Bowtie 2 metrics to file - metrics_dict_to_file(FileFieldList=Bowtie2FileFieldList, - MultiQCDataDir=args.MULTIQC_DATA_DIR, - OutFile='host_removal_metrics.tsv', - se=args.SINGLE_END) + metrics_dict_to_file( + FileFieldList=Bowtie2FileFieldList, + MultiQCDataDir=args.MULTIQC_DATA_DIR, + OutFile="host_removal_metrics.tsv", + se=args.SINGLE_END, + ) -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/bin/plot_mag_depths.py b/bin/plot_mag_depths.py index 5e7bff24..aab38473 100755 --- a/bin/plot_mag_depths.py +++ b/bin/plot_mag_depths.py @@ -9,11 +9,24 @@ import seaborn as sns from scipy import stats + def parse_args(args=None): parser = argparse.ArgumentParser() - parser.add_argument('-d', '--bin_depths' , required=True, metavar='FILE' , help="Bin depths file in TSV format (for one assembly and binning method): bin, sample1_depth, sample2_depth, ....") - parser.add_argument('-g', '--groups' , required=True, metavar='FILE' , help="File in TSV format containing group information for samples: sample, group") - parser.add_argument('-o', "--out" , required=True, metavar='FILE', type=str, help="Output file.") + parser.add_argument( + "-d", + "--bin_depths", + required=True, + metavar="FILE", + help="Bin depths file in TSV format (for one assembly and binning method): bin, sample1_depth, sample2_depth, ....", + ) + parser.add_argument( + "-g", + "--groups", + required=True, + metavar="FILE", + help="File in TSV format containing group information for samples: sample, group", + ) + parser.add_argument("-o", "--out", required=True, metavar="FILE", type=str, help="Output file.") return parser.parse_args(args) @@ -21,31 +34,38 @@ def main(args=None): args = parse_args(args) # load data - df = pd.read_csv(args.bin_depths, sep='\t', index_col=0) - groups = pd.read_csv(args.groups, sep='\t', index_col=0, names=['sample', 'group']) + df = pd.read_csv(args.bin_depths, sep="\t", index_col=0) + groups = pd.read_csv(args.groups, sep="\t", index_col=0, names=["sample", "group"]) # add pseudo-abundances (sample-wise? 
dependent on lib-size) pseudo_cov = 0.1 * df[df > 0].min().min() df.replace(0, pseudo_cov, inplace=True) # compute centered log-ratios # divide df by sample-wise geometric means - gmeans = stats.gmean(df, axis=0) # apply on axis=0: 'index' - df = np.log(df.div(gmeans, axis='columns')) # divide column-wise (axis=1|'columns'), take natural logorithm - df.index.name='MAGs' - df.columns.name='Samples' + gmeans = stats.gmean(df, axis=0) # apply on axis=0: 'index' + df = np.log(df.div(gmeans, axis="columns")) # divide column-wise (axis=1|'columns'), take natural logorithm + df.index.name = "MAGs" + df.columns.name = "Samples" # prepare colors for group information - color_map= dict(zip(groups['group'].unique(), sns.color_palette(n_colors=len(groups['group'].unique())))) + color_map = dict(zip(groups["group"].unique(), sns.color_palette(n_colors=len(groups["group"].unique())))) # plot plt.figure() - bin_labels=True - if (len(df) > 30): - bin_labels=False - sns.clustermap(df, row_cluster=True, yticklabels=bin_labels, cmap="vlag", center=0, col_colors=groups.group.map(color_map), figsize=(6,6)) + bin_labels = True + if len(df) > 30: + bin_labels = False + sns.clustermap( + df, + row_cluster=True, + yticklabels=bin_labels, + cmap="vlag", + center=0, + col_colors=groups.group.map(color_map), + figsize=(6, 6), + ) plt.savefig(args.out) if __name__ == "__main__": sys.exit(main()) - diff --git a/bin/split_fasta.py b/bin/split_fasta.py index 07d369a6..c9149f25 100755 --- a/bin/split_fasta.py +++ b/bin/split_fasta.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -#USAGE: ./split_fasta.py <*.unbinned.fa(.gz)> +# USAGE: ./split_fasta.py <*.unbinned.fa(.gz)> import pandas as pd import gzip @@ -19,20 +19,20 @@ min_length_to_retain_contig = int(argv[4]) # Base name for file output -if input_file.endswith('.gz'): +if input_file.endswith(".gz"): rm_ext = input_file.replace(".gz", "") - out_base = out_base = re.sub(r'\.fasta$|\.fa$|\.fna$', '', rm_ext) + out_base = out_base = re.sub(r"\.fasta$|\.fa$|\.fna$", "", rm_ext) else: - out_base = re.sub(r'\.fasta$|\.fa$|\.fna$', '', input_file) + out_base = re.sub(r"\.fasta$|\.fa$|\.fna$", "", input_file) # Data structures to separate and store sequences -df_above_threshold = pd.DataFrame(columns=['id','seq','length']) -pooled = [] -remaining = [] +df_above_threshold = pd.DataFrame(columns=["id", "seq", "length"]) +pooled = [] +remaining = [] -if input_file.endswith('.gz'): - with gzip.open(input_file, 'rt') as f: - fasta_sequences = SeqIO.parse(f,'fasta') +if input_file.endswith(".gz"): + with gzip.open(input_file, "rt") as f: + fasta_sequences = SeqIO.parse(f, "fasta") for fasta in fasta_sequences: name, sequence = fasta.id, str(fasta.seq) @@ -40,16 +40,18 @@ # store each sequence above threshold together with its length into df if length >= length_threshold: - df_above_threshold = df_above_threshold.append({"id":name, "seq":sequence, "length":length}, ignore_index = True) + df_above_threshold = df_above_threshold.append( + {"id": name, "seq": sequence, "length": length}, ignore_index=True + ) # contigs to retain and pool elif length >= min_length_to_retain_contig: - pooled.append(SeqRecord(Seq(sequence, generic_dna), id = name)) + pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name)) # remaining sequences else: - remaining.append(SeqRecord(Seq(sequence, generic_dna), id = name)) + remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name)) else: with open(input_file) as f: - fasta_sequences = SeqIO.parse(f,'fasta') + fasta_sequences = SeqIO.parse(f, "fasta") for 
fasta in fasta_sequences: name, sequence = fasta.id, str(fasta.seq) @@ -57,28 +59,30 @@ # store each sequence above threshold together with its length into df if length >= length_threshold: - df_above_threshold = df_above_threshold.append({"id":name, "seq":sequence, "length":length}, ignore_index = True) + df_above_threshold = df_above_threshold.append( + {"id": name, "seq": sequence, "length": length}, ignore_index=True + ) # contigs to retain and pool elif length >= min_length_to_retain_contig: - pooled.append(SeqRecord(Seq(sequence, generic_dna), id = name)) + pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name)) # remaining sequences else: - remaining.append(SeqRecord(Seq(sequence, generic_dna), id = name)) + remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name)) # Sort sequences above threshold by length -df_above_threshold.sort_values(by=['length'], ascending=False, inplace=True) +df_above_threshold.sort_values(by=["length"], ascending=False, inplace=True) df_above_threshold.reset_index(drop=True, inplace=True) # Write `max_sequences` longest sequences (above threshold) into separate files, add remainder to pooled for index, row in df_above_threshold.iterrows(): - if index+1 <= max_sequences: - print("write "+out_base+"."+str(index+1)+".fa") - out = (SeqRecord(Seq(row['seq'], generic_dna), id = row['id'])) - SeqIO.write(out, out_base+"."+str(index+1)+".fa", "fasta") + if index + 1 <= max_sequences: + print("write " + out_base + "." + str(index + 1) + ".fa") + out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"]) + SeqIO.write(out, out_base + "." + str(index + 1) + ".fa", "fasta") else: - pooled.append(SeqRecord(Seq(row['seq'], generic_dna), id = row['id'])) + pooled.append(SeqRecord(Seq(row["seq"], generic_dna), id=row["id"])) -print("write "+out_base+".pooled.fa") -SeqIO.write(pooled, out_base+".pooled.fa", "fasta") -print("write "+out_base+".remaining.fa") -SeqIO.write(remaining, out_base+".remaining.fa", "fasta") +print("write " + out_base + ".pooled.fa") +SeqIO.write(pooled, out_base + ".pooled.fa", "fasta") +print("write " + out_base + ".remaining.fa") +SeqIO.write(remaining, out_base + ".remaining.fa", "fasta") diff --git a/bin/summary_busco.py b/bin/summary_busco.py index e404a124..b4a8c99b 100755 --- a/bin/summary_busco.py +++ b/bin/summary_busco.py @@ -8,13 +8,37 @@ import os.path import pandas as pd + def parse_args(args=None): parser = argparse.ArgumentParser() - parser.add_argument('-a', '--auto', default=False, action='store_true', help="BUSCO run in auto lineage selection mode.") - parser.add_argument('-sd', "--summaries_domain", nargs="+", metavar='FILE', help="List of BUSCO summary files for domains.") - parser.add_argument('-ss', "--summaries_specific", nargs="+", metavar='FILE', help="List of BUSCO summary files for specific lineages.") - parser.add_argument('-f', "--failed_bins", nargs="+", metavar='FILE', help="List of files containing bin name for which BUSCO analysis failed.") - parser.add_argument('-o', "--out", required=True, metavar='FILE', type=argparse.FileType('w'), help="Output file containing final BUSCO summary.") + parser.add_argument( + "-a", "--auto", default=False, action="store_true", help="BUSCO run in auto lineage selection mode." + ) + parser.add_argument( + "-sd", "--summaries_domain", nargs="+", metavar="FILE", help="List of BUSCO summary files for domains." 
+ ) + parser.add_argument( + "-ss", + "--summaries_specific", + nargs="+", + metavar="FILE", + help="List of BUSCO summary files for specific lineages.", + ) + parser.add_argument( + "-f", + "--failed_bins", + nargs="+", + metavar="FILE", + help="List of files containing bin name for which BUSCO analysis failed.", + ) + parser.add_argument( + "-o", + "--out", + required=True, + metavar="FILE", + type=argparse.FileType("w"), + help="Output file containing final BUSCO summary.", + ) return parser.parse_args(args) @@ -27,50 +51,66 @@ def main(args=None): # "# Summarized benchmarking in BUSCO notation for file /path/to/MEGAHIT-testset1.contigs.fa" # " C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:148" - regexes = [r"# Summarized benchmarking in BUSCO notation for file (\S+)", r"# The lineage dataset is: (\S+) \(", r" C:(\S+)%\[S:", - r"%\[S:(\S+)%,D:", r"%,D:(\S+)%\],F:", r"%\],F:(\S+)%,M:", r"%,M:(\S+)%,n:", r"%,n:(\S+)"] - columns_domain = ["GenomeBin", \ - "Domain", \ - "%Complete (domain)", \ - "%Complete and single-copy (domain)", \ - "%Complete and duplicated (domain)", \ - "%Fragmented (domain)", \ - "%Missing (domain)", \ - "Total number (domain)"] - columns_specific = ["GenomeBin", \ - "Specific lineage dataset", \ - "%Complete (specific)", \ - "%Complete and single-copy (specific)", \ - "%Complete and duplicated (specific)", \ - "%Fragmented (specific)", \ - "%Missing (specific)", \ - "Total number (specific)"] + regexes = [ + r"# Summarized benchmarking in BUSCO notation for file (\S+)", + r"# The lineage dataset is: (\S+) \(", + r" C:(\S+)%\[S:", + r"%\[S:(\S+)%,D:", + r"%,D:(\S+)%\],F:", + r"%\],F:(\S+)%,M:", + r"%,M:(\S+)%,n:", + r"%,n:(\S+)", + ] + columns_domain = [ + "GenomeBin", + "Domain", + "%Complete (domain)", + "%Complete and single-copy (domain)", + "%Complete and duplicated (domain)", + "%Fragmented (domain)", + "%Missing (domain)", + "Total number (domain)", + ] + columns_specific = [ + "GenomeBin", + "Specific lineage dataset", + "%Complete (specific)", + "%Complete and single-copy (specific)", + "%Complete and duplicated (specific)", + "%Fragmented (specific)", + "%Missing (specific)", + "Total number (specific)", + ] if args.auto: - columns = ["GenomeBin", \ - "Domain", \ - "%Complete (domain)", \ - "%Complete and single-copy (domain)", \ - "%Complete and duplicated (domain)", \ - "%Fragmented (domain)", \ - "%Missing (domain)", \ - "Total number (domain)", \ - "Specific lineage dataset", \ - "%Complete (specific)", \ - "%Complete and single-copy (specific)", \ - "%Complete and duplicated (specific)", \ - "%Fragmented (specific)", \ - "%Missing (specific)", \ - "Total number (specific)"] + columns = [ + "GenomeBin", + "Domain", + "%Complete (domain)", + "%Complete and single-copy (domain)", + "%Complete and duplicated (domain)", + "%Fragmented (domain)", + "%Missing (domain)", + "Total number (domain)", + "Specific lineage dataset", + "%Complete (specific)", + "%Complete and single-copy (specific)", + "%Complete and duplicated (specific)", + "%Fragmented (specific)", + "%Missing (specific)", + "Total number (specific)", + ] else: - columns = ["GenomeBin", \ - "Specific lineage dataset", \ - "%Complete (specific)", \ - "%Complete and single-copy (specific)", \ - "%Complete and duplicated (specific)", \ - "%Fragmented (specific)", \ - "%Missing (specific)", \ - "Total number (specific)"] + columns = [ + "GenomeBin", + "Specific lineage dataset", + "%Complete (specific)", + "%Complete and single-copy (specific)", + "%Complete and duplicated (specific)", + "%Fragmented 
(specific)", + "%Missing (specific)", + "Total number (specific)", + ] # Search each summary file using its regex results_domain = [] @@ -112,10 +152,26 @@ def main(args=None): with open(file) as infile: line = infile.readline() # in case of failed placements domain summary was used and specific part will be filled with NAs when merging - if re.split(r'[\t\n]', line)[1] != "Placements failed": - failed_bin = re.split(r'[\t\n]', line)[0] + if re.split(r"[\t\n]", line)[1] != "Placements failed": + failed_bin = re.split(r"[\t\n]", line)[0] if args.auto: - results = [failed_bin, pd.NA, "0.0", "0.0", "0.0", "0.0", "100.0", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA] + results = [ + failed_bin, + pd.NA, + "0.0", + "0.0", + "0.0", + "0.0", + "100.0", + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + ] else: results = [failed_bin, pd.NA, "0.0", "0.0", "0.0", "0.0", "100.0", pd.NA] failed.append(results) @@ -123,15 +179,12 @@ def main(args=None): # merge results if args.auto: - df_final = df_domain\ - .merge(df_specific, on="GenomeBin", how='outer')\ - .append(df_failed) + df_final = df_domain.merge(df_specific, on="GenomeBin", how="outer").append(df_failed) # check if 'Domain' is 'NA', but 'Specific lineage dataset' given -> 'Viruses' - df_final.loc[pd.isna(df_final['Domain']) & pd.notna(df_final['Specific lineage dataset']), 'Domain'] = "Viruses" + df_final.loc[pd.isna(df_final["Domain"]) & pd.notna(df_final["Specific lineage dataset"]), "Domain"] = "Viruses" else: - df_final = df_specific\ - .append(df_failed) + df_final = df_specific.append(df_failed) df_final.to_csv(args.out, sep="\t", index=False) diff --git a/bin/summary_gtdbtk.py b/bin/summary_gtdbtk.py index 2f402acc..44bb7d1d 100755 --- a/bin/summary_gtdbtk.py +++ b/bin/summary_gtdbtk.py @@ -6,15 +6,48 @@ import os.path import pandas as pd + def parse_args(args=None): parser = argparse.ArgumentParser() - parser.add_argument('-x', "--extension", required=True, type=str, help="File extension passed to GTDB-TK and substracted by GTDB-Tk from bin names in results files.") - parser.add_argument('-s', "--summaries", nargs="+", metavar='FILE', help="List of GTDB-tk summary files.") - parser.add_argument('-fi', "--filtered_bins", nargs="+", metavar='FILE', help="List of files containing names of bins which where filtered out during GTDB-tk analysis.") - parser.add_argument('-fa', "--failed_bins", nargs="+", metavar='FILE', help="List of files containing bin names for which GTDB-tk analysis failed.") - parser.add_argument('-d', "--qc_discarded_bins", nargs="+", metavar='FILE', type=str, help="List of files containing names of bins which were discarded based on BUSCO metrics.") - - parser.add_argument('-o', "--out", required=True, metavar='FILE', type=argparse.FileType('w'), help="Output file containing final GTDB-tk summary.") + parser.add_argument( + "-x", + "--extension", + required=True, + type=str, + help="File extension passed to GTDB-TK and substracted by GTDB-Tk from bin names in results files.", + ) + parser.add_argument("-s", "--summaries", nargs="+", metavar="FILE", help="List of GTDB-tk summary files.") + parser.add_argument( + "-fi", + "--filtered_bins", + nargs="+", + metavar="FILE", + help="List of files containing names of bins which where filtered out during GTDB-tk analysis.", + ) + parser.add_argument( + "-fa", + "--failed_bins", + nargs="+", + metavar="FILE", + help="List of files containing bin names for which GTDB-tk analysis failed.", + ) + parser.add_argument( + "-d", + 
"--qc_discarded_bins", + nargs="+", + metavar="FILE", + type=str, + help="List of files containing names of bins which were discarded based on BUSCO metrics.", + ) + + parser.add_argument( + "-o", + "--out", + required=True, + metavar="FILE", + type=argparse.FileType("w"), + help="Output file containing final GTDB-tk summary.", + ) return parser.parse_args(args) @@ -24,82 +57,144 @@ def main(args=None): if not args.summaries and not args.filtered_bins and not args.failed_bins and not args.qc_discarded_bins: sys.exit("Either --summaries, --filtered_bins, --failed_bins or --qc_discarded_bins must be specified!") - columns = ["user_genome", \ - "classification", \ - "fastani_reference", \ - "fastani_reference_radius", \ - "fastani_taxonomy", \ - "fastani_ani", \ - "fastani_af", \ - "closest_placement_reference", \ - "closest_placement_radius", \ - "closest_placement_taxonomy", \ - "closest_placement_ani", \ - "closest_placement_af", \ - "pplacer_taxonomy", \ - "classification_method", \ - "note", \ - "other_related_references(genome_id,species_name,radius,ANI,AF)", \ - "msa_percent", \ - "translation_table", \ - "red_value", \ - "warnings"] + columns = [ + "user_genome", + "classification", + "fastani_reference", + "fastani_reference_radius", + "fastani_taxonomy", + "fastani_ani", + "fastani_af", + "closest_placement_reference", + "closest_placement_radius", + "closest_placement_taxonomy", + "closest_placement_ani", + "closest_placement_af", + "pplacer_taxonomy", + "classification_method", + "note", + "other_related_references(genome_id,species_name,radius,ANI,AF)", + "msa_percent", + "translation_table", + "red_value", + "warnings", + ] # Note: currently all columns included # For bins already discarded based on BUSCO QC metrics discarded = [] if args.qc_discarded_bins: for bin_name in args.qc_discarded_bins: - bin_results = [bin_name, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA] + bin_results = [ + bin_name, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + ] discarded.append(bin_results) df_final = pd.DataFrame(discarded, columns=columns) - df_final.set_index('user_genome', inplace=True) + df_final.set_index("user_genome", inplace=True) # For bins with succesfull GTDB-tk classification if args.summaries: for file in args.summaries: - df_summary = pd.read_csv(file, sep='\t')[columns] + df_summary = pd.read_csv(file, sep="\t")[columns] # add by GTDB-Tk substracted file extension again to bin names (at least until changed consistently in rest of pipeline) - df_summary['user_genome'] = df_summary['user_genome'].astype(str) + '.' + args.extension - df_summary.set_index('user_genome', inplace=True) + df_summary["user_genome"] = df_summary["user_genome"].astype(str) + "." + args.extension + df_summary.set_index("user_genome", inplace=True) df_final = df_final.append(df_summary, verify_integrity=True) # For bins that were filtered out by GTDB-tk (e.g. 
due to insufficient number of AAs in MSA) filtered = [] if args.filtered_bins: for file in args.filtered_bins: - df = pd.read_csv(file, sep='\t', names=["bin_name", "reason"]) + df = pd.read_csv(file, sep="\t", names=["bin_name", "reason"]) for index, row in df.iterrows(): - bin_name = row['bin_name'] - bin_results = [bin_name, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA] + bin_name = row["bin_name"] + bin_results = [ + bin_name, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + ] filtered.append(bin_results) df_filtered = pd.DataFrame(filtered, columns=columns) - df_filtered['user_genome'] = df_filtered['user_genome'].astype(str) + '.' + args.extension - df_filtered.set_index('user_genome', inplace=True) + df_filtered["user_genome"] = df_filtered["user_genome"].astype(str) + "." + args.extension + df_filtered.set_index("user_genome", inplace=True) df_final = df_final.append(df_filtered, verify_integrity=True) # For bins for which GTDB-tk classification failed failed = [] if args.failed_bins: for file in args.failed_bins: - df = pd.read_csv(file, sep='\t', names=["bin_name", "reason"]) + df = pd.read_csv(file, sep="\t", names=["bin_name", "reason"]) for index, row in df.iterrows(): - bin_name = row['bin_name'] - bin_results = [bin_name, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA] + bin_name = row["bin_name"] + bin_results = [ + bin_name, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + ] failed.append(bin_results) df_failed = pd.DataFrame(failed, columns=columns) - df_failed['user_genome'] = df_failed['user_genome'].astype(str) + '.' + args.extension - df_failed.set_index('user_genome', inplace=True) + df_failed["user_genome"] = df_failed["user_genome"].astype(str) + "." + args.extension + df_failed.set_index("user_genome", inplace=True) df_final = df_final.append(df_failed, verify_integrity=True) # write output - df_final\ - .reset_index()\ - .rename(columns={"index": "user_genome"})\ - .to_csv(args.out, sep="\t", index=False) + df_final.reset_index().rename(columns={"index": "user_genome"}).to_csv(args.out, sep="\t", index=False) if __name__ == "__main__":
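Editor's note: the blackified helper scripts above (get_mag_depths_summary.py, summary_busco.py, summary_gtdbtk.py) still call pandas.DataFrame.append, which is deprecated since pandas 1.4 and removed in pandas 2.0, so this patch only reformats them and does not change that constraint. A minimal sketch of the equivalent pd.concat pattern, using hypothetical input file names purely for illustration:

    #!/usr/bin/env python
    # Sketch only: pd.concat equivalent of the deprecated DataFrame.append()
    # calls used in the summary scripts above; file names are illustrative.
    import pandas as pd

    def combine_depth_tables(depth_files):
        """Concatenate per-assembly bin-depth TSVs indexed by bin name."""
        frames = [pd.read_csv(f, index_col="bin", sep="\t") for f in depth_files]
        # verify_integrity=True mirrors the original append() call and raises
        # if the same bin name occurs in more than one input table.
        return pd.concat(frames, sort=True, verify_integrity=True)

    if __name__ == "__main__":
        # Hypothetical usage; the real script receives paths via argparse (-d/--depths).
        combined = combine_depth_tables(["assembly1-binDepths.tsv", "assembly2-binDepths.tsv"])
        combined.to_csv("bin_depths_summary.tsv", sep="\t")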
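Similarly, split_fasta.py still imports generic_dna from Bio.Alphabet, a module removed in Biopython 1.78, so the reformatted script continues to require an older Biopython. A hedged sketch of the alphabet-free record handling (only the SeqRecord construction and writing step; the length thresholds and output naming of the original are omitted):

    # Sketch only: Biopython >= 1.78 removed Bio.Alphabet, so Seq() takes no
    # alphabet argument; SeqRecord construction is otherwise unchanged.
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio import SeqIO

    def write_records(named_sequences, out_path):
        """named_sequences: iterable of (id, sequence-string) pairs."""
        records = [SeqRecord(Seq(sequence), id=name, description="") for name, sequence in named_sequences]
        SeqIO.write(records, out_path, "fasta")

    # Hypothetical usage mirroring the pooled/remaining outputs of split_fasta.py:
    # write_records(pooled_pairs, out_base + ".pooled.fa")
    # write_records(remaining_pairs, out_base + ".remaining.fa")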
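For reference, the centred log-ratio (CLR) transform applied in plot_mag_depths.py (zero depths replaced by a pseudo-coverage, each sample divided by its geometric mean, then the natural log taken) can be checked in isolation; the depth values below are made up for illustration only:

    # Sketch only: CLR transform as used in plot_mag_depths.py, on toy data.
    import numpy as np
    import pandas as pd
    from scipy import stats

    depths = pd.DataFrame(
        {"sample1": [10.0, 1.0, 0.0], "sample2": [5.0, 5.0, 2.0]},
        index=["bin1", "bin2", "bin3"],
    )
    # zeros are replaced with a small pseudo-coverage before taking logs
    pseudo_cov = 0.1 * depths[depths > 0].min().min()
    depths = depths.replace(0, pseudo_cov)
    # divide each column by its geometric mean, then take the natural log;
    # the CLR values of each sample then sum to (approximately) zero
    clr = np.log(depths.div(stats.gmean(depths, axis=0), axis="columns"))
    print(clr.sum(axis=0))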