Skip to content

Commit

Permalink
Merge pull request #19 from biocore/unscaled_cumulative
Browse files Browse the repository at this point in the history
Unscaled cumulative plots, and cumulative with a null curve via Monte
  • Loading branch information
wasade authored Jan 29, 2025
2 parents 7f79d27 + 107b332 commit 90ab453
Show file tree
Hide file tree
Showing 2 changed files with 190 additions and 17 deletions.
147 changes: 132 additions & 15 deletions micov/_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ def ordered_coverage(coverage, grp, target):
.filter(pl.col(COLUMN_GENOME_ID) == target)
.sort(COLUMN_PERCENT_COVERED)
.with_row_index()
.with_columns(x=pl.col('index') / pl.len())).collect()
.with_columns(x=pl.col('index') / pl.len(),
x_unscaled=pl.col('index'))).collect()


def slice_positions(positions, id_):
Expand All @@ -40,6 +41,119 @@ def per_sample_plots(all_coverage, all_covered_positions, metadata,
sample_metadata_column, output, scale=10000)


def per_sample_plots_monte(all_coverage, all_covered_positions, metadata,
sample_metadata_column, output, target_lookup, iters):
for genome in all_coverage[COLUMN_GENOME_ID].unique():
target_name = target_lookup[genome]
cumulative_monte(metadata, all_coverage, all_covered_positions, genome,
sample_metadata_column, output, target_name, iters)


def compute_cumulative(coverage, grp, target, target_positions, lengths):
current = pl.DataFrame([], schema=BED_COV_SCHEMA.dtypes_flat)
grp_coverage = ordered_coverage(coverage, grp, target)

if len(grp_coverage) == 0:
return None, None

cur_y = []
cur_x = grp_coverage['x_unscaled']
for id_ in grp_coverage[COLUMN_SAMPLE_ID]:
next_ = slice_positions(target_positions, id_).collect()
current = compress(pl.concat([current, next_]))
per_cov = coverage_percent(current, lengths).collect()
cur_y.append(per_cov[COLUMN_PERCENT_COVERED].item(0))
return cur_x, cur_y


def cumulative_monte(metadata, coverage, positions, target, variable, output,
target_name, iters):
plt.figure(figsize=(12, 8))
labels = []

target_positions = positions.filter(pl.col(COLUMN_GENOME_ID) == target)
coverage = coverage.filter(pl.col(COLUMN_GENOME_ID) == target)
cov_samples = coverage.select(pl.col(COLUMN_SAMPLE_ID).unique())
metadata = metadata.filter(pl.col(COLUMN_SAMPLE_ID).is_in(cov_samples))

if len(target_positions) == 0:
raise ValueError()

if len(coverage) == 0:
raise ValueError()

lengths = coverage[[COLUMN_GENOME_ID, COLUMN_LENGTH]].unique()

if len(lengths) > 1:
raise ValueError()

length = lengths[COLUMN_LENGTH].item(0)
value_order = metadata.select(pl.col(variable)
.unique()
.sort())[variable]
max_n = 0
for name, color in zip(value_order, range(0, 10)):
color = f'C{color}'

grp = metadata.filter(pl.col(variable) == name)

n = len(grp)
if n < 10:
continue
max_n = max(n, max_n)

cur_x, cur_y = compute_cumulative(coverage, grp, target,
target_positions, lengths)
if cur_x is None:
continue
else:
labels.append(f"{name} (n={len(cur_x)})")
plt.plot(cur_x, cur_y, color=color)

if not labels:
return

monte_y = []
for it in range(iters):
monte = (metadata.select(pl.col(COLUMN_SAMPLE_ID)
.shuffle())
.head(max_n))
grp_monte = metadata.filter(pl.col(COLUMN_SAMPLE_ID)
.is_in(monte))
monte_x, cur_y = compute_cumulative(coverage, grp_monte, target,
target_positions, lengths)
monte_y.append(cur_y)

monte_y = np.asarray(monte_y)
median = np.median(monte_y, axis=0)
std = np.std(monte_y, axis=0)

plt.plot(monte_x, median, color='k', linestyle='dotted', linewidth=1,
alpha=0.6)
plt.plot(monte_x, median + std, color='k', linestyle='--', linewidth=1,
alpha=0.6)
plt.plot(monte_x, median - std, color='k', linestyle='--', linewidth=1,
alpha=0.6)
plt.fill_between(monte_x, median - std, median + std, color='k',
alpha=0.1)
labels.append(f'Monte Carlo (n={len(monte_x)})')

ax = plt.gca()
ax.set_ylabel('Percent genome covered', fontsize=16)
ax.set_xlabel('Within group sample rank by coverage', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=16)
ax.tick_params(axis='both', which='minor', labelsize=16)
ax.set_xlim(0, max_n - 1)
ax.set_ylim(0, 100)
ax.set_title((f'Cumulative: {target_name}({target}) '
f'({length}bp)'), fontsize=16)
plt.legend(labels, fontsize=14)

plt.tight_layout()
plt.savefig(f'{output}.{target_name}.{target}.{variable}.cumulative-monte.png')
plt.close()


def cumulative(metadata, coverage, positions, target, variable, output):
plt.figure(figsize=(12, 8))
labels = []
Expand All @@ -61,23 +175,19 @@ def cumulative(metadata, coverage, positions, target, variable, output):

length = lengths[COLUMN_LENGTH].item(0)

for (name, ), grp in metadata.group_by([variable, ], maintain_order=True):
value_order = metadata.select(pl.col(variable).unique().sort())[variable]
for name in value_order:
grp = metadata.filter(pl.col(variable) == name)
current = pl.DataFrame([], schema=BED_COV_SCHEMA.dtypes_flat)

grp_coverage = ordered_coverage(coverage, grp, target)

if len(grp_coverage) == 0:
cur_x, cur_y = compute_cumulative(coverage, grp, target,
target_positions, lengths)
if cur_x is None:
continue
labels.append(name)

cur_y = []
cur_x = grp_coverage['x']
for id_ in grp_coverage[COLUMN_SAMPLE_ID]:
next_ = slice_positions(target_positions, id_).collect()
current = compress(pl.concat([current, next_]))
per_cov = coverage_percent(current, lengths).collect()
cur_y.append(per_cov[COLUMN_PERCENT_COVERED].item(0))

covs.append(cur_y)
plt.plot(cur_x, cur_y)

Expand Down Expand Up @@ -108,7 +218,11 @@ def non_cumulative(metadata, coverage, target, variable, output):
labels = []
covs = []

for (name, ), grp in metadata.group_by([variable, ], maintain_order=True):
value_order = metadata.select(pl.col(variable)
.unique()
.sort())[variable]
for name in value_order:
grp = metadata.filter(pl.col(variable) == name)
grp_coverage = ordered_coverage(coverage, grp, target)

if len(grp_coverage) == 0:
Expand Down Expand Up @@ -190,9 +304,11 @@ def position_plot(metadata, coverage, positions, target, variable, output, scale
colors = []
target_positions = positions.filter(pl.col(COLUMN_GENOME_ID) == target).lazy()

for ((name, ), grp), color in zip(metadata.group_by([variable, ],
maintain_order=True),
range(0, 10)):
value_order = metadata.select(pl.col(variable)
.unique()
.sort())[variable]
for name, color in zip(value_order, range(0, 10)):
grp = metadata.filter(pl.col(variable) == name)
color = f'C{color}'
grp_coverage = ordered_coverage(coverage, grp, target)

Expand Down Expand Up @@ -254,6 +370,7 @@ def position_plot(metadata, coverage, positions, target, variable, output, scale
leg = ax.get_legend()
for i, lh in enumerate(leg.legendHandles):
lh.set_color(colors[i])
lh._sizes = [5.0, ]

ax.tick_params(axis='both', which='major', labelsize=16)
ax.tick_params(axis='both', which='minor', labelsize=16)
Expand Down
60 changes: 58 additions & 2 deletions micov/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@
from ._cov import coverage_percent
from ._convert import cigar_to_lens
from ._per_sample import per_sample_coverage
from ._plot import per_sample_plots, single_sample_position_plot
from ._plot import (per_sample_plots, per_sample_plots_monte,
single_sample_position_plot)
from ._utils import logger
from ._constants import COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, BED_COV_SAMPLEID_SCHEMA
from ._constants import (COLUMN_SAMPLE_ID, COLUMN_GENOME_ID,
BED_COV_SAMPLEID_SCHEMA,
COLUMN_START, COLUMN_CIGAR, COLUMN_STOP)
from ._quant import pos_to_bins, make_csv_ready


Expand Down Expand Up @@ -243,6 +246,59 @@ def per_sample_group(parquet_coverage, sample_metadata, sample_metadata_column,
sample_metadata_column, output)


@cli.command()
@click.option('--parquet-coverage', type=click.Path(exists=False),
required=True, help=('Pre-computed coverage data as parquet. '
'This should be the basename used, i.e. '
'for "foo.coverage.parquet", please use '
'"foo"'))
@click.option('--sample-metadata', type=click.Path(exists=True),
required=True,
help='A metadata file with the sample metadata')
@click.option('--sample-metadata-column', type=str,
required=True,
help='The column to consider in the sample metadata')
@click.option('--features-to-keep', type=click.Path(exists=True),
required=False,
help='A metadata file with the features to keep')
@click.option('--iters', type=int, default=10, required=False)
@click.option('--target-names', type=str, required=False)
@click.option('--output', type=click.Path(exists=False), required=True)
@click.option('--plot', is_flag=True, default=False,
help='Generate plots from features')
def per_sample_monte(parquet_coverage, sample_metadata, sample_metadata_column,
features_to_keep, output, plot, iters, target_names):
"""Generate sample group plots and coverage data with a null curve."""
_load_db(parquet_coverage, sample_metadata, features_to_keep)

all_covered_positions = duckdb.sql("SELECT * from covered_positions").pl()
all_coverage = duckdb.sql("SELECT * FROM coverage").pl()
metadata_pl = duckdb.sql("SELECT * FROM metadata").pl()

if target_names is not None:
target_names = dict(pl.scan_csv(target_names,
separator='\t',
new_columns=['feature-id', 'lineage'],
has_header=False)
.with_columns(pl.col('lineage')
.str
.split(';')
.list
.get(-1)
.str
.replace_all(r" |\[|\]", "_")
.alias('species'))
.select('feature-id', 'species')
.collect()
.iter_rows())
else:
sql = "SELECT DISTINCT genome_id FROM coverage"
target_names = {k[0]: k[0] for k in duckdb.sql(sql).fetchall()}

per_sample_plots_monte(all_coverage, all_covered_positions, metadata_pl,
sample_metadata_column, output, target_names, iters)


def _load_db(dbbase, sample_metadata, features_to_keep):
metadata_pl = parse_sample_metadata(sample_metadata)
sample_column = metadata_pl.columns[0]
Expand Down

0 comments on commit 90ab453

Please sign in to comment.