From 95c3b86b35c9a00f322e35cb46dcf3cfbf0437d6 Mon Sep 17 00:00:00 2001
From: Andrea Talenti <andrea.talenti@nanoporetech.com>
Date: Tue, 1 Aug 2023 16:29:09 +0000
Subject: [PATCH] CW-2506: Add adapted bcfstats loader from aplanat

---
 CHANGELOG.md                    |   4 +
 ezcharts/__init__.py            |   4 +-
 ezcharts/components/bcfstats.py | 281 ++++++++++++++++++++++++++++++++
 3 files changed, 287 insertions(+), 2 deletions(-)
 create mode 100644 ezcharts/components/bcfstats.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0f9e791..b0a5ab0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v0.6.3]
+### Added
+- bcfstats loader from aplanat.
+
 ## [v0.6.2]
 ### Added
 - ClinVar vcf table loader.
diff --git a/ezcharts/__init__.py b/ezcharts/__init__.py
index dc90adb..4f7a183 100644
--- a/ezcharts/__init__.py
+++ b/ezcharts/__init__.py
@@ -1,6 +1,6 @@
 """Simple eCharts API."""
 
-__version__ = "0.6.2"
+__version__ = "0.6.3"
 
 import argparse
 import importlib
@@ -36,7 +36,7 @@ def cli():
     # all component demos, plus some others
     components = [
         'params', 'nextclade', 'fastcat', 'dss',
-        'modkit', 'mosdepth', 'clinvar'
+        'modkit', 'mosdepth', 'clinvar', 'bcfstats'
         ]
     others = ['ezcharts.demo', 'ezcharts.plots', 'ezcharts.plots.ideogram']
 
diff --git a/ezcharts/components/bcfstats.py b/ezcharts/components/bcfstats.py
new file mode 100644
index 0000000..84a3237
--- /dev/null
+++ b/ezcharts/components/bcfstats.py
@@ -0,0 +1,281 @@
+"""Parse single and multiple outputs from bcftools stats into dataframes."""
+
+import argparse
+from collections import defaultdict
+import os
+import re
+
+import pandas as pd
+from pandas.api import types as pd_types
+
+from ezcharts.components.ezchart import EZChart
+from ezcharts.components.reports.comp import ComponentReport
+from ezcharts.layout.base import Snippet
+from ezcharts.layout.snippets import DataTable, Tabs
+from ezcharts.plots.categorical import barplot
+
+
+# Categorical types
+CATEGORICAL = pd_types.CategoricalDtype(ordered=True)
+
+# Multiple types structure for the different dataframes
+coltypes = {
+    'SN': {
+        'filename': CATEGORICAL,
+        'id': CATEGORICAL,
+        'MNPs': int,
+        'SNPs': int,
+        'indels': int,
+        'multiallelic SNP sites': int,
+        'multiallelic sites': int,
+        'no-ALTs': int,
+        'others': int,
+        'records': int,
+        'samples': int,
+    },
+    'TSTV': {
+        'filename': CATEGORICAL,
+        'id': CATEGORICAL,
+        'ts': int,
+        'tv': int,
+        'ts/tv': float,
+        'ts (1st ALT)': int,
+        'tv (1st ALT)': int,
+        'ts/tv (1st ALT)': float
+    },
+    'SiS': {
+        'filename': CATEGORICAL,
+        'id': CATEGORICAL,
+        'allele count': int,
+        'number of SNPs': int,
+        'number of transitions': int,
+        'number of transversions': int,
+        'number of indels': int,
+        'repeat-consistent': int,
+        'repeat-inconsistent': int,
+        'not applicable': int
+    },
+    'AF': {
+        'filename': CATEGORICAL,
+        'id': CATEGORICAL,
+        'allele frequency': float,
+        'number of SNPs': int,
+        'number of transitions': int,
+        'number of transversions': int,
+        'number of indels': int,
+        'repeat-consistent': int,
+        'repeat-inconsistent': int,
+        'not applicable': int
+    },
+    'QUAL': {
+        'filename': CATEGORICAL,
+        'id': CATEGORICAL,
+        'Quality': float,
+        'number of SNPs': int,
+        'number of transitions (1st ALT)': int,
+        'number of transversions (1st ALT)': int,
+        'number of indels': int
+    },
+    'IDD': {
+        'filename': CATEGORICAL,
+        'id': CATEGORICAL,
+        'length (deletions negative)': int,
+        'number of sites': int,
+        'number of genotypes': int,
+        'mean VAF': str   # mean VAF can be a '.' for single-sample VCFs, hence string
+    }
+}
+
+
+class BcftoolsStats(Snippet):
+    """Generate modified bases summary plots."""
+
+    def __init__(self, theme='epi2melabs', **kwargs):
+        """Create bcfstats summary report.
+
+        If multiple single-sample stats are provided, each will be
+        displayed in a distinct tab, with the exception of the summary table.
+
+        :param stats: A path to a single-sample bcftools stats file or a pandas
+            DataFrame from load_bcfstats
+        :param sample_name: A string or list of strings specifying the sample names;
+            it has to be of the same length as the input stats files
+        """
+        super().__init__(styles=None, classes=None)
+
+        with self:
+            # Make table of bcftools stats
+            if "stats" in kwargs:
+                if isinstance(kwargs["stats"], pd.DataFrame):
+                    all_bcfstats = kwargs["stats"]
+                else:
+                    all_bcfstats = load_bcfstats(
+                        kwargs["stats"], sample_names=kwargs["sample_names"])
+                # Check that the file is not empty
+                if len(all_bcfstats) == 0:
+                    raise pd.errors.EmptyDataError('Input stats file is empty')
+                # If one sample, save one plot
+                for stat, table in all_bcfstats.items():
+                    # Add base stats table
+                    if stat == 'SN':
+                        # we only got a single sample --> no dropdown
+                        DataTable.from_pandas(
+                            table, export=True, use_index=False)
+
+                    # Showcase indel length
+                    if stat == 'IDD':
+                        if len(table['id'].unique()) == 1:
+                            # we only got a single sample --> no dropdown
+                            plt = barplot(
+                                data=table,
+                                x="length (deletions negative)",
+                                y="number of sites")
+                            plt.xAxis.axisLabel.rotate = 90
+                            plt.xAxis.axisLabel.fontSize = 10
+                            EZChart(plt, 'epi2melabs')
+                        else:
+                            # several samples --> use a dropdown menu
+                            tabs = Tabs()
+                            with tabs.add_dropdown_menu():
+                                for fname, df_sample in table.groupby('id'):
+                                    with tabs.add_dropdown_tab(fname):
+                                        plt = barplot(
+                                            data=table,
+                                            x="length (deletions negative)",
+                                            y="number of sites")
+                                        plt.xAxis.axisLabel.rotate = 90
+                                        plt.xAxis.axisLabel.fontSize = 10
+                                        EZChart(plt, 'epi2melabs')
+
+
+def split_blocks(fname):
+    """Split lines of a file into blocks based on comment lines."""
+    comment = list()
+    data = list()
+    state = None
+    with open(fname, 'r') as fh:
+        # TODO: handle empty tables
+        for line in fh.readlines():
+            if line.startswith('#'):
+                if state != 'comment' and state is not None:
+                    yield comment, data
+                    comment, data = list(), list()
+                state = 'comment'
+                comment.append(line.strip('# ').rstrip())
+            else:
+                state = 'data'
+                data.append(line.rstrip())
+        yield comment, data
+
+
+def parse_bcftools_stats(fname, sample_name=None):
+    """Parse `bcftools stats` output.
+
+    :param fname: file to parse.
+    :param sample_name: name of the sample.
+    """
+    tables = dict()
+    filename = fname.split('/')[-1]
+    for comment, data in split_blocks(fname):
+        fields = [x.rstrip() for x in re.split(r'\[\d+\]', comment[-1])]
+        section, fields = fields[0], fields[1:]
+        rows = list()
+        for d in data:
+            items = d.split('\t')
+            if items[0] != section:
+                raise ValueError("first data field not equal to section key")
+            rows.append(items[1:])
+        tables[section] = pd.DataFrame(rows, columns=fields)
+        # Add file name as initial column in each of them
+        tables[section].insert(0, 'filename', filename)
+        # Add sample name if provided
+        if sample_name:
+            tables[section]['id'] = sample_name
+        else:
+            tables[section]['id'] = filename
+    # now some special handling
+    SN = tables['SN']
+    SN['key'] = SN['key'].apply(
+        lambda x: x.replace('number of ', '').rstrip(':'))
+    tables['SN'] = pd.DataFrame(
+        SN.pivot(index='id', columns='key', values='value').to_records())
+    # Add file name as initial column in each of them
+    tables['SN'].insert(0, 'filename', filename)
+    # Add sample name if provided
+    if sample_name:
+        tables['SN']['id'] = sample_name
+    else:
+        tables['SN']['id'] = filename
+    # Return tables
+    return tables
+
+
+def load_bcfstats(stats, sample_names=None):
+    """Parse multiple bcf stats outputs and combine.
+
+    :param fnames: list of filenames output by `bcftools stats.`
+    :param samples_names: list of names of each sample to add to the
+        dataframes.
+    """
+    # Check if it is one or more inputs
+    if os.path.isdir(stats):
+        filenames = [f"{stats}/{i}" for i in os.listdir(stats)]
+    elif os.path.isfile(stats):
+        filenames = [stats]
+    else:
+        raise Exception(f'No valid input: {stats}')
+
+    # Check sample names
+    if sample_names is not None:
+        if len(sample_names) != len(filenames):
+            raise TypeError(
+                "`filenames` and `sample_names` should be of equal length.")
+        zipped = zip(filenames, sample_names)
+        dfs = [
+            parse_bcftools_stats(filename, sample_name=sname)
+            for (filename, sname) in zipped]
+    else:
+        dfs = [parse_bcftools_stats(filename) for filename in filenames]
+
+    # Collect every table in one
+    all_tables = defaultdict(list)
+    for s_tables in dfs:
+        # we don't use the 'id' field - its for intersections
+        # and unions of two VCFs given to bcftools stats
+        for stat, table in s_tables.items():
+            all_tables[stat].append(table)
+    for stat, tables in all_tables.items():
+        if stat in coltypes:
+            all_tables[stat] = pd.concat(tables).astype(coltypes[stat])
+        else:
+            all_tables[stat] = pd.concat(tables).infer_objects()
+    return all_tables
+
+
+def main(args):
+    """Entry point to demonstrate the bcftools stats."""
+    comp_title = 'BCFtools stats'
+    sample_names = args.sample_name if args.sample_name else None
+    stats = BcftoolsStats(stats=args.stats, sample_names=sample_names)
+    report = ComponentReport(comp_title, stats)
+    report.write(args.output)
+
+
+def argparser():
+    """Argument parser for entrypoint."""
+    parser = argparse.ArgumentParser(
+        'BCFtools stats',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        add_help=False)
+    parser.add_argument(
+        "--stats",
+        required=True,
+        help="Input bcftools stats file.")
+    parser.add_argument(
+        "--sample_name",
+        help="Name of the sample; valid for a single sample.")
+    parser.add_argument(
+        "--output",
+        default="bcfstats_report.html",
+        help="Output HTML file.")
+    return parser