From 95c3b86b35c9a00f322e35cb46dcf3cfbf0437d6 Mon Sep 17 00:00:00 2001 From: Andrea Talenti Date: Tue, 1 Aug 2023 16:29:09 +0000 Subject: [PATCH] CW-2506: Add adapted bcfstats loader from aplanat --- CHANGELOG.md | 4 + ezcharts/__init__.py | 4 +- ezcharts/components/bcfstats.py | 281 ++++++++++++++++++++++++++++++++ 3 files changed, 287 insertions(+), 2 deletions(-) create mode 100644 ezcharts/components/bcfstats.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f9e791..b0a5ab0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v0.6.3] +### Added +- bcfstats loader from aplanat. + ## [v0.6.2] ### Added - ClinVar vcf table loader. diff --git a/ezcharts/__init__.py b/ezcharts/__init__.py index dc90adb..4f7a183 100644 --- a/ezcharts/__init__.py +++ b/ezcharts/__init__.py @@ -1,6 +1,6 @@ """Simple eCharts API.""" -__version__ = "0.6.2" +__version__ = "0.6.3" import argparse import importlib @@ -36,7 +36,7 @@ def cli(): # all component demos, plus some others components = [ 'params', 'nextclade', 'fastcat', 'dss', - 'modkit', 'mosdepth', 'clinvar' + 'modkit', 'mosdepth', 'clinvar', 'bcfstats' ] others = ['ezcharts.demo', 'ezcharts.plots', 'ezcharts.plots.ideogram'] diff --git a/ezcharts/components/bcfstats.py b/ezcharts/components/bcfstats.py new file mode 100644 index 0000000..84a3237 --- /dev/null +++ b/ezcharts/components/bcfstats.py @@ -0,0 +1,281 @@ +"""Parse single and multiple outputs from bcftools stats into dataframes.""" + +import argparse +from collections import defaultdict +import os +import re + +import pandas as pd +from pandas.api import types as pd_types + +from ezcharts.components.ezchart import EZChart +from ezcharts.components.reports.comp import ComponentReport +from ezcharts.layout.base import Snippet +from ezcharts.layout.snippets import DataTable, Tabs +from ezcharts.plots.categorical import barplot + + +# Categorical types +CATEGORICAL = pd_types.CategoricalDtype(ordered=True) + +# Multiple types structure for the different dataframes +coltypes = { + 'SN': { + 'filename': CATEGORICAL, + 'id': CATEGORICAL, + 'MNPs': int, + 'SNPs': int, + 'indels': int, + 'multiallelic SNP sites': int, + 'multiallelic sites': int, + 'no-ALTs': int, + 'others': int, + 'records': int, + 'samples': int, + }, + 'TSTV': { + 'filename': CATEGORICAL, + 'id': CATEGORICAL, + 'ts': int, + 'tv': int, + 'ts/tv': float, + 'ts (1st ALT)': int, + 'tv (1st ALT)': int, + 'ts/tv (1st ALT)': float + }, + 'SiS': { + 'filename': CATEGORICAL, + 'id': CATEGORICAL, + 'allele count': int, + 'number of SNPs': int, + 'number of transitions': int, + 'number of transversions': int, + 'number of indels': int, + 'repeat-consistent': int, + 'repeat-inconsistent': int, + 'not applicable': int + }, + 'AF': { + 'filename': CATEGORICAL, + 'id': CATEGORICAL, + 'allele frequency': float, + 'number of SNPs': int, + 'number of transitions': int, + 'number of transversions': int, + 'number of indels': int, + 'repeat-consistent': int, + 'repeat-inconsistent': int, + 'not applicable': int + }, + 'QUAL': { + 'filename': CATEGORICAL, + 'id': CATEGORICAL, + 'Quality': float, + 'number of SNPs': int, + 'number of transitions (1st ALT)': int, + 'number of transversions (1st ALT)': int, + 'number of indels': int + }, + 'IDD': { + 'filename': CATEGORICAL, + 'id': CATEGORICAL, + 'length (deletions negative)': int, + 'number of sites': int, + 'number of genotypes': int, + 'mean VAF': str # mean VAF can be a '.' for single-sample VCFs, hence string + } +} + + +class BcftoolsStats(Snippet): + """Generate modified bases summary plots.""" + + def __init__(self, theme='epi2melabs', **kwargs): + """Create bcfstats summary report. + + If multiple single-sample stats are provided, each will be + displayed in a distinct tab, with the exception of the summary table. + + :param stats: A path to a single-sample bcftools stats file or a pandas + DataFrame from load_bcfstats + :param sample_name: A string or list of strings specifying the sample names; + it has to be of the same length as the input stats files + """ + super().__init__(styles=None, classes=None) + + with self: + # Make table of bcftools stats + if "stats" in kwargs: + if isinstance(kwargs["stats"], pd.DataFrame): + all_bcfstats = kwargs["stats"] + else: + all_bcfstats = load_bcfstats( + kwargs["stats"], sample_names=kwargs["sample_names"]) + # Check that the file is not empty + if len(all_bcfstats) == 0: + raise pd.errors.EmptyDataError('Input stats file is empty') + # If one sample, save one plot + for stat, table in all_bcfstats.items(): + # Add base stats table + if stat == 'SN': + # we only got a single sample --> no dropdown + DataTable.from_pandas( + table, export=True, use_index=False) + + # Showcase indel length + if stat == 'IDD': + if len(table['id'].unique()) == 1: + # we only got a single sample --> no dropdown + plt = barplot( + data=table, + x="length (deletions negative)", + y="number of sites") + plt.xAxis.axisLabel.rotate = 90 + plt.xAxis.axisLabel.fontSize = 10 + EZChart(plt, 'epi2melabs') + else: + # several samples --> use a dropdown menu + tabs = Tabs() + with tabs.add_dropdown_menu(): + for fname, df_sample in table.groupby('id'): + with tabs.add_dropdown_tab(fname): + plt = barplot( + data=table, + x="length (deletions negative)", + y="number of sites") + plt.xAxis.axisLabel.rotate = 90 + plt.xAxis.axisLabel.fontSize = 10 + EZChart(plt, 'epi2melabs') + + +def split_blocks(fname): + """Split lines of a file into blocks based on comment lines.""" + comment = list() + data = list() + state = None + with open(fname, 'r') as fh: + # TODO: handle empty tables + for line in fh.readlines(): + if line.startswith('#'): + if state != 'comment' and state is not None: + yield comment, data + comment, data = list(), list() + state = 'comment' + comment.append(line.strip('# ').rstrip()) + else: + state = 'data' + data.append(line.rstrip()) + yield comment, data + + +def parse_bcftools_stats(fname, sample_name=None): + """Parse `bcftools stats` output. + + :param fname: file to parse. + :param sample_name: name of the sample. + """ + tables = dict() + filename = fname.split('/')[-1] + for comment, data in split_blocks(fname): + fields = [x.rstrip() for x in re.split(r'\[\d+\]', comment[-1])] + section, fields = fields[0], fields[1:] + rows = list() + for d in data: + items = d.split('\t') + if items[0] != section: + raise ValueError("first data field not equal to section key") + rows.append(items[1:]) + tables[section] = pd.DataFrame(rows, columns=fields) + # Add file name as initial column in each of them + tables[section].insert(0, 'filename', filename) + # Add sample name if provided + if sample_name: + tables[section]['id'] = sample_name + else: + tables[section]['id'] = filename + # now some special handling + SN = tables['SN'] + SN['key'] = SN['key'].apply( + lambda x: x.replace('number of ', '').rstrip(':')) + tables['SN'] = pd.DataFrame( + SN.pivot(index='id', columns='key', values='value').to_records()) + # Add file name as initial column in each of them + tables['SN'].insert(0, 'filename', filename) + # Add sample name if provided + if sample_name: + tables['SN']['id'] = sample_name + else: + tables['SN']['id'] = filename + # Return tables + return tables + + +def load_bcfstats(stats, sample_names=None): + """Parse multiple bcf stats outputs and combine. + + :param fnames: list of filenames output by `bcftools stats.` + :param samples_names: list of names of each sample to add to the + dataframes. + """ + # Check if it is one or more inputs + if os.path.isdir(stats): + filenames = [f"{stats}/{i}" for i in os.listdir(stats)] + elif os.path.isfile(stats): + filenames = [stats] + else: + raise Exception(f'No valid input: {stats}') + + # Check sample names + if sample_names is not None: + if len(sample_names) != len(filenames): + raise TypeError( + "`filenames` and `sample_names` should be of equal length.") + zipped = zip(filenames, sample_names) + dfs = [ + parse_bcftools_stats(filename, sample_name=sname) + for (filename, sname) in zipped] + else: + dfs = [parse_bcftools_stats(filename) for filename in filenames] + + # Collect every table in one + all_tables = defaultdict(list) + for s_tables in dfs: + # we don't use the 'id' field - its for intersections + # and unions of two VCFs given to bcftools stats + for stat, table in s_tables.items(): + all_tables[stat].append(table) + for stat, tables in all_tables.items(): + if stat in coltypes: + all_tables[stat] = pd.concat(tables).astype(coltypes[stat]) + else: + all_tables[stat] = pd.concat(tables).infer_objects() + return all_tables + + +def main(args): + """Entry point to demonstrate the bcftools stats.""" + comp_title = 'BCFtools stats' + sample_names = args.sample_name if args.sample_name else None + stats = BcftoolsStats(stats=args.stats, sample_names=sample_names) + report = ComponentReport(comp_title, stats) + report.write(args.output) + + +def argparser(): + """Argument parser for entrypoint.""" + parser = argparse.ArgumentParser( + 'BCFtools stats', + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + add_help=False) + parser.add_argument( + "--stats", + required=True, + help="Input bcftools stats file.") + parser.add_argument( + "--sample_name", + help="Name of the sample; valid for a single sample.") + parser.add_argument( + "--output", + default="bcfstats_report.html", + help="Output HTML file.") + return parser