Skip to content

Commit

Permalink
switch to lazy streaming
Browse files (browse the repository at this point in the history)
  • Loading branch information
AroneyS committed Apr 10, 2024
1 parent b4118dc commit 7ac7ce8
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions binchicken/workflow/scripts/target_elusive.py
Original file line number Diff line number Diff line change
@@ -35,7 +35,7 @@ def get_clusters(
chosen_samples = [(samples[i], list(np.array(samples)[b[b != i]])) for i, b in enumerate(best_samples)]

sample_combinations = (
- pl.DataFrame({"cluster_size": range(1, MAX_COASSEMBLY_SAMPLES)})
+ pl.LazyFrame({"cluster_size": range(1, MAX_COASSEMBLY_SAMPLES)})
.with_columns(
sample_combinations = pl.col("cluster_size").map_elements(
lambda x: [i for i in itertools.combinations(range(PRECLUSTER_SIZE-1), x)],
@@ -49,7 +49,7 @@ def get_clusters(
logging.info("Choosing preclusters based on distances")
with pl.StringCache():
preclusters = (
- pl.DataFrame(chosen_samples, schema={"sample": pl.Categorical, "samples": pl.List(pl.Categorical)})
+ pl.LazyFrame(chosen_samples, schema={"sample": pl.Categorical, "samples": pl.List(pl.Categorical)})
.with_columns(length = pl.col("samples").list.len())
.join(sample_combinations, how="cross")
.with_columns(pl.col("samples").list.gather(pl.col("sample_combinations")))
@@ -60,6 +60,7 @@ def get_clusters(
.list.join(",")
)
.unique()
+ .collect(streaming=True)
)

logging.info(f"Found {preclusters.height} preclusters")
Expand Down

0 comments on commit 7ac7ce8

Please sign in to comment.