Merge pull request #18 from sherlyn99/contamination_identification

biocore · Oct 20, 2024 · 7f79d27 · 7f79d27
2 parents 5a8d254 + 61dd298
commit 7f79d27
Show file tree

Hide file tree

Showing 3 changed files with 438 additions and 1 deletion.
diff --git a/micov/_quant.py b/micov/_quant.py
@@ -0,0 +1,100 @@
+import numpy as np
+import polars as pl
+
+
+def make_csv_ready(df):
+    return df.with_columns(
+        (
+            pl.lit("[")
+            + pl.col(x).list.eval(pl.element().cast(str)).list.join(",")
+            + pl.lit("]")
+        ).alias(x)
+        for x, y in df.schema.items()
+        if y == pl.List(pl.String) or y == pl.List(pl.Int64)
+    )
+
+
+def create_bin_list(genome_length, bin_num):
+    # note that bin_list is adjusted to 1-indexed to be compatible with pl.cut
+    bin_list_pos_stop = (
+        pl.Series("a", [0, genome_length], strict=False)
+        .hist(bin_count=bin_num)
+        .lazy()
+        .select(pl.col("breakpoint").alias("bin_stop"))
+        .with_row_index("bin_idx", offset=1)
+    )
+    bin_list_pos_start = (
+        pl.Series("a", [0, genome_length], strict=False)
+        .hist(bin_count=bin_num)
+        .lazy()
+        .select(pl.col("breakpoint").alias("bin_start"))
+        .with_row_index("bin_idx", offset=2)
+    )
+    bin_list = (
+        bin_list_pos_start.join(bin_list_pos_stop, on="bin_idx", how="right")
+        .fill_null(0)
+        .select([pl.col("bin_idx"), pl.col("bin_start"), pl.col("bin_stop")])
+        .with_columns(pl.col("bin_idx").cast(pl.Int64))
+        .lazy()
+    )
+    return bin_list
+
+
+def pos_to_bins(pos, genome_length, bin_num):
+    pos = pos.lazy()
+    bin_list = create_bin_list(genome_length, bin_num)
+
+    # get start_bin_idx and stop_bin_idx
+    bin_edges = [0.0] + bin_list.select(
+        pl.col("bin_stop")
+    ).collect().to_series().to_list()
+    cut_start = (
+        pos.select(pl.col("start"))
+        .collect()
+        .to_series()
+        .cut(
+            bin_edges,
+            labels=np.arange(len(bin_edges) + 1).astype(str),
+            left_closed=True,
+        )
+        .cast(pl.Int64)
+        .alias("start_bin_idx")
+    )
+    cut_stop = (
+        pos.select(pl.col("stop"))
+        .collect()
+        .to_series()
+        .cut(
+            bin_edges,
+            labels=np.arange(len(bin_edges) + 1).astype(str),
+            left_closed=False,
+        )
+        .cast(pl.Int64)
+        .alias("stop_bin_idx")
+    )
+    pos = pos.with_columns([cut_start, cut_stop])
+
+    # Update stop_bin_idx +1 for pl.arange and generate range of bins
+    pos = pos.with_columns(
+        (pl.col("stop_bin_idx") + 1).alias("stop_bin_idx_add1")
+    )
+
+    # Generate range of bins covered
+    pos = pos.with_columns(
+        pl.int_ranges("start_bin_idx", "stop_bin_idx_add1").alias("bin_idx")
+    ).drop("stop_bin_idx_add1")
+
+    # Generate bin_df
+    bin_df = (
+        pos.explode("bin_idx")
+        .group_by("bin_idx")
+        .agg(
+            pl.col("start").len().alias("read_hits"),
+            pl.col("sample_id").n_unique().alias("sample_hits"),
+            pl.col("sample_id").unique().sort().alias("samples"),
+        )
+        .sort(by="bin_idx")
+        .join(bin_list, how="left", on="bin_idx")
+    )
+
+    return bin_df.collect(), pos.collect()
diff --git a/micov/cli.py b/micov/cli.py
@@ -16,7 +16,8 @@
 from ._per_sample import per_sample_coverage
 from ._plot import per_sample_plots, single_sample_position_plot
 from ._utils import logger
-from ._constants import COLUMN_SAMPLE_ID
+from ._constants import COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, BED_COV_SAMPLEID_SCHEMA
+from ._quant import pos_to_bins, make_csv_ready
 
 
 def _first_col_as_set(fp):
@@ -272,5 +273,38 @@ def _load_db(dbbase, sample_metadata, features_to_keep):
                                               FROM coverage)""")
 
 
+@cli.command()
+@click.option('-pos', '--covered-positions', type=click.Path(exists=True),
+              required=True,
+              help='Covered positions calculated from one or more samples')
+@click.option('-o', '--outdir', type=click.Path(exists=True),
+              required=True, help="Output directory. If new, will be created.")
+@click.option('-g', '--genome-id', type=str,
+              required=True, help="Genome ID of the genome of interest")
+@click.option('-l', '--genome-length', type=int,
+              required=True, help="Length of the genome of interest")
+@click.option('-n', '--bin-num', type=int, default=1000,
+              required=False, help="Number of bins")
+def binning(covered_positions, outdir, genome_id, genome_length, bin_num):
+    """Bin genome positions and quantify read and sample hits across bins."""
+    pos = pl.read_csv(covered_positions, separator='\t',
+                      new_columns=BED_COV_SAMPLEID_SCHEMA.columns,
+                      schema_overrides=BED_COV_SAMPLEID_SCHEMA.dtypes_dict,
+                      has_header=False, skip_rows=1).lazy()
+    pos = pos.filter(pl.col(COLUMN_GENOME_ID).is_in([genome_id]))
+
+    bin_df, pos_updated = pos_to_bins(pos, genome_length, bin_num)
+
+    if not os.path.exists(outdir):
+        os.makedirs(outdir)
+
+    bin_df = make_csv_ready(bin_df)
+    pos_updated = make_csv_ready(pos_updated)
+
+    bin_df.write_csv(f"{outdir}/bin_stats.tsv", separator="\t", include_header=True)
+    pos_updated.write_csv(f"{outdir}/pos_binned.tsv", separator="\t",
+                          include_header=True)
+
+
 if __name__ == '__main__':
     cli()