Skip to content

Commit

Permalink
Merge pull request #18 from sherlyn99/contamination_identification
Browse files Browse the repository at this point in the history
  • Loading branch information
wasade authored Oct 20, 2024
2 parents 5a8d254 + 61dd298 commit 7f79d27
Show file tree
Hide file tree
Showing 3 changed files with 438 additions and 1 deletion.
100 changes: 100 additions & 0 deletions micov/_quant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import numpy as np
import polars as pl


def make_csv_ready(df):
return df.with_columns(
(
pl.lit("[")
+ pl.col(x).list.eval(pl.element().cast(str)).list.join(",")
+ pl.lit("]")
).alias(x)
for x, y in df.schema.items()
if y == pl.List(pl.String) or y == pl.List(pl.Int64)
)


def create_bin_list(genome_length, bin_num):
# note that bin_list is adjusted to 1-indexed to be compatible with pl.cut
bin_list_pos_stop = (
pl.Series("a", [0, genome_length], strict=False)
.hist(bin_count=bin_num)
.lazy()
.select(pl.col("breakpoint").alias("bin_stop"))
.with_row_index("bin_idx", offset=1)
)
bin_list_pos_start = (
pl.Series("a", [0, genome_length], strict=False)
.hist(bin_count=bin_num)
.lazy()
.select(pl.col("breakpoint").alias("bin_start"))
.with_row_index("bin_idx", offset=2)
)
bin_list = (
bin_list_pos_start.join(bin_list_pos_stop, on="bin_idx", how="right")
.fill_null(0)
.select([pl.col("bin_idx"), pl.col("bin_start"), pl.col("bin_stop")])
.with_columns(pl.col("bin_idx").cast(pl.Int64))
.lazy()
)
return bin_list


def pos_to_bins(pos, genome_length, bin_num):
pos = pos.lazy()
bin_list = create_bin_list(genome_length, bin_num)

# get start_bin_idx and stop_bin_idx
bin_edges = [0.0] + bin_list.select(
pl.col("bin_stop")
).collect().to_series().to_list()
cut_start = (
pos.select(pl.col("start"))
.collect()
.to_series()
.cut(
bin_edges,
labels=np.arange(len(bin_edges) + 1).astype(str),
left_closed=True,
)
.cast(pl.Int64)
.alias("start_bin_idx")
)
cut_stop = (
pos.select(pl.col("stop"))
.collect()
.to_series()
.cut(
bin_edges,
labels=np.arange(len(bin_edges) + 1).astype(str),
left_closed=False,
)
.cast(pl.Int64)
.alias("stop_bin_idx")
)
pos = pos.with_columns([cut_start, cut_stop])

# Update stop_bin_idx +1 for pl.arange and generate range of bins
pos = pos.with_columns(
(pl.col("stop_bin_idx") + 1).alias("stop_bin_idx_add1")
)

# Generate range of bins covered
pos = pos.with_columns(
pl.int_ranges("start_bin_idx", "stop_bin_idx_add1").alias("bin_idx")
).drop("stop_bin_idx_add1")

# Generate bin_df
bin_df = (
pos.explode("bin_idx")
.group_by("bin_idx")
.agg(
pl.col("start").len().alias("read_hits"),
pl.col("sample_id").n_unique().alias("sample_hits"),
pl.col("sample_id").unique().sort().alias("samples"),
)
.sort(by="bin_idx")
.join(bin_list, how="left", on="bin_idx")
)

return bin_df.collect(), pos.collect()
36 changes: 35 additions & 1 deletion micov/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
from ._per_sample import per_sample_coverage
from ._plot import per_sample_plots, single_sample_position_plot
from ._utils import logger
from ._constants import COLUMN_SAMPLE_ID
from ._constants import COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, BED_COV_SAMPLEID_SCHEMA
from ._quant import pos_to_bins, make_csv_ready


def _first_col_as_set(fp):
Expand Down Expand Up @@ -272,5 +273,38 @@ def _load_db(dbbase, sample_metadata, features_to_keep):
FROM coverage)""")


@cli.command()
@click.option('-pos', '--covered-positions', type=click.Path(exists=True),
required=True,
help='Covered positions calculated from one or more samples')
@click.option('-o', '--outdir', type=click.Path(exists=True),
required=True, help="Output directory. If new, will be created.")
@click.option('-g', '--genome-id', type=str,
required=True, help="Genome ID of the genome of interest")
@click.option('-l', '--genome-length', type=int,
required=True, help="Length of the genome of interest")
@click.option('-n', '--bin-num', type=int, default=1000,
required=False, help="Number of bins")
def binning(covered_positions, outdir, genome_id, genome_length, bin_num):
"""Bin genome positions and quantify read and sample hits across bins."""
pos = pl.read_csv(covered_positions, separator='\t',
new_columns=BED_COV_SAMPLEID_SCHEMA.columns,
schema_overrides=BED_COV_SAMPLEID_SCHEMA.dtypes_dict,
has_header=False, skip_rows=1).lazy()
pos = pos.filter(pl.col(COLUMN_GENOME_ID).is_in([genome_id]))

bin_df, pos_updated = pos_to_bins(pos, genome_length, bin_num)

if not os.path.exists(outdir):
os.makedirs(outdir)

bin_df = make_csv_ready(bin_df)
pos_updated = make_csv_ready(pos_updated)

bin_df.write_csv(f"{outdir}/bin_stats.tsv", separator="\t", include_header=True)
pos_updated.write_csv(f"{outdir}/pos_binned.tsv", separator="\t",
include_header=True)


if __name__ == '__main__':
cli()
Loading

0 comments on commit 7f79d27

Please sign in to comment.