Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add micov bin for single genome of interest #18

Merged
merged 10 commits into from
Oct 20, 2024
102 changes: 102 additions & 0 deletions micov/_quant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import numpy as np
import polars as pl


def make_csv_ready(df):
return df.with_columns(
(
pl.lit("[") +
pl.col(x)
.list.eval(pl.element().cast(str))
.list.join(",") +
pl.lit("]")
).alias(x)
for x, y in df.schema.items()
if y == pl.List(pl.String) or y == pl.List(pl.Int64)
)


def create_bin_list(genome_length, bin_num):
# note that bin_list is adjusted to 1-indexed to be compatible with pl.cut
bin_list_pos_stop = (
pl.Series("a", [0, genome_length], strict=False)
.hist(bin_count=bin_num)
.lazy()
.select(pl.col("breakpoint").alias("bin_stop"))
.with_row_index("bin_idx", offset=1)
)
bin_list_pos_start = (
pl.Series("a", [0, genome_length], strict=False)
.hist(bin_count=bin_num)
.lazy()
.select(pl.col("breakpoint").alias("bin_start"))
.with_row_index("bin_idx", offset=2)
)
bin_list = (
bin_list_pos_start.join(bin_list_pos_stop, on="bin_idx", how="right")
.fill_null(0)
.select([pl.col("bin_idx"), pl.col("bin_start"), pl.col("bin_stop")])
.with_columns(pl.col("bin_idx").cast(pl.Int64))
.collect()
sherlyn99 marked this conversation as resolved.
Show resolved Hide resolved
)
return bin_list


def pos_to_bins(pos, genome_length, bin_num):
pos = pos.lazy()
bin_list = create_bin_list(genome_length, bin_num).lazy()

# get start_bin_idx and stop_bin_idx
bin_edges = [0.0] + bin_list.select(
pl.col("bin_stop")
).collect().to_series().to_list()
cut_start = (
pos.select(pl.col("start"))
.collect()
.to_series()
.cut(
bin_edges,
labels=np.arange(len(bin_edges) + 1).astype(str),
left_closed=True,
)
.cast(pl.Int64)
.alias("start_bin_idx")
)
cut_stop = (
pos.select(pl.col("stop"))
.collect()
.to_series()
.cut(
bin_edges,
labels=np.arange(len(bin_edges) + 1).astype(str),
left_closed=False,
)
.cast(pl.Int64)
.alias("stop_bin_idx")
)
pos = pos.with_columns([cut_start, cut_stop])

# Update stop_bin_idx +1 for pl.arange and generate range of bins
pos = pos.with_columns(
(pl.col("stop_bin_idx") + 1).alias("stop_bin_idx_add1")
)

# Generate range of bins covered
pos = pos.with_columns(
pl.int_ranges("start_bin_idx", "stop_bin_idx_add1").alias("bin_idx")
).drop("stop_bin_idx_add1")

# Generate bin_df
bin_df = (
pos.explode("bin_idx")
.group_by("bin_idx")
.agg(
pl.col("start").len().alias("read_hits"),
pl.col("sample_id").n_unique().alias("sample_hits"),
pl.col("sample_id").unique().sort().alias("samples"),
)
.sort(by="bin_idx")
.join(bin_list, how="left", on="bin_idx")
)

return bin_df.collect(), pos.collect()
36 changes: 35 additions & 1 deletion micov/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
from ._per_sample import per_sample_coverage
from ._plot import per_sample_plots, single_sample_position_plot
from ._utils import logger
from ._constants import COLUMN_SAMPLE_ID
from ._constants import COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, BED_COV_SAMPLEID_SCHEMA
from ._quant import pos_to_bins, make_csv_ready


def _first_col_as_set(fp):
Expand Down Expand Up @@ -273,5 +274,38 @@ def _load_db(dbbase, sample_metadata, features_to_keep):
FROM coverage)""")


@cli.command()
@click.option('-pos', '--covered-positions', type=click.Path(exists=True),
required=True,
help='Covered positions calculated from one or more samples')
@click.option('-o', '--outdir', type=click.Path(exists=True),
required=True, help="Output directory. If new, will be created.")
@click.option('-g', '--genome-id', type=str,
required=True, help="Genome ID of the genome of interest")
@click.option('-l', '--genome-length', type=int,
required=True, help="Length of the genome of interest")
@click.option('-n', '--bin-num', type=int, default=1000,
required=False, help="Number of bins")
def binning(covered_positions, outdir, genome_id, genome_length, bin_num):
"""Bin genome positions and quantify read and sample hits across bins."""
pos = pl.read_csv(covered_positions, separator='\t',
new_columns=BED_COV_SAMPLEID_SCHEMA.columns,
schema_overrides=BED_COV_SAMPLEID_SCHEMA.dtypes_dict,
has_header=False, skip_rows=1).lazy()
wasade marked this conversation as resolved.
Show resolved Hide resolved
pos = pos.filter(pl.col(COLUMN_GENOME_ID).is_in([genome_id]))

bin_df, pos_updated = pos_to_bins(pos, genome_length, bin_num)

if not os.path.exists(outdir):
os.makedirs(outdir)

bin_df = make_csv_ready(bin_df)
pos_updated = make_csv_ready(pos_updated)

bin_df.write_csv(f"{outdir}/bin_stats.tsv", separator="\t", include_header=True)
pos_updated.write_csv(f"{outdir}/pos_binned.tsv", separator="\t",
include_header=True)


if __name__ == '__main__':
cli()
Loading
Loading