Skip to content

Commit

Permalink
fix: Remove empty data before fitting
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristianMichelsen committed Oct 17, 2022
1 parent 5b370dc commit 94d7975
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 11 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ data/
data*/
real-data*/
old-real-data*/
performance/
raw_data/
!raw_data/alignment.bam
!raw_data/names-mdmg.dmp
Expand Down
22 changes: 22 additions & 0 deletions src/metaDMG/fit/fits.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,19 @@ def filter_k_sum(
return df_mismatches.query("k_sum_total > 0")


def filter_max_N_in_group(
    config: Config,
    df_mismatches: pd.DataFrame,
) -> pd.DataFrame:
    """Keep only the rows whose ``max_N_in_group`` is strictly positive.

    Before filtering, the distinct ``tax_id`` values of the rows where
    ``max_N_in_group == 0`` are written to the debug log so that it is
    visible which Tax IDs were removed.

    Args:
        config: Not read by this function; it is part of the signature
            that callers already use.
        df_mismatches: Frame with at least the columns ``tax_id`` and
            ``max_N_in_group``.

    Returns:
        The rows of ``df_mismatches`` for which ``max_N_in_group > 0``.
    """
    # Collect the Tax IDs that are about to be dropped (max_N_in_group == 0)
    # purely for the debug log; the actual filtering happens in the return.
    tax_ids_to_drop = set(df_mismatches.query("max_N_in_group == 0")["tax_id"].unique())
    if tax_ids_to_drop:
        logger.debug("Dropping the following Tax IDs since max_N_in_group == 0:")
        logger.debug(tax_ids_to_drop)
    return df_mismatches.query("max_N_in_group > 0")


#%%


Expand All @@ -569,6 +582,15 @@ def compute(config, df_mismatches):
logger.debug(s)
raise BadDataError(s)

# filter out tax_id's with no data in them (max_N_in_group == 0)
df_mismatches = filter_max_N_in_group(config, df_mismatches)

if len(df_mismatches) == 0:
s = f"{config['sample']} df_mismatches.query('max_N_in_group > 0') is empty."
logger.debug("WARNING: BadDataError")
logger.debug(s)
raise BadDataError(s)

# # filter out tax_id's with 0 k_sum_total
# df_mismatches = filter_k_sum(config, df_mismatches)

Expand Down
41 changes: 30 additions & 11 deletions src/metaDMG/fit/mismatches.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,18 +148,37 @@ def add_k_sum_counts(df):
return df


def compute_min_N_in_group(group, config):
    """Return the smallest value of the first base column within one group.

    Rows with ``position > 0`` are read through ``bases_forward[0]`` and rows
    with ``position < 0`` through ``bases_reverse[0]``.

    Args:
        group: Frame with a ``position`` column and the referenced base columns.
        config: Mapping providing the ``"forward_only"`` flag.

    Returns:
        The forward minimum when ``config["forward_only"]`` is truthy,
        otherwise the smaller of the forward and reverse minima.
    """
    forward_only = config["forward_only"]

    forward_rows = group[group.position > 0]
    smallest_forward = forward_rows[bases_forward[0]].min()
    if forward_only:
        return smallest_forward

    reverse_rows = group[group.position < 0]
    smallest_reverse = reverse_rows[bases_reverse[0]].min()
    return min(smallest_forward, smallest_reverse)
def compute_min_max_N_in_group(group, config):
    """Return the minimum and maximum of the ``N`` column of ``group``.

    Args:
        group: Frame (or mapping of columns) exposing an ``N`` column.
        config: Not read by this function; accepted because callers pass it.

    Returns:
        A ``pd.Series`` with the entries ``min_N_in_group`` and
        ``max_N_in_group``, in that order.
    """
    counts = group["N"]
    return pd.Series(
        {
            "min_N_in_group": counts.min(),
            "max_N_in_group": counts.max(),
        }
    )

# mask_forward = group.position > 0
# mask_reverse = group.position < 0

# if config["forward_only"]:
# min_N = group[mask_forward][bases_forward[0]].min()
# max_N = group[mask_forward][bases_forward[0]].max()
# # return min_N, max_N
# return pd.Series({"min_N_in_group": min_N, "max_N_in_group": max_N})

# min_N_forward = group[mask_forward > 0][bases_forward[0]].min()
# min_N_reverse = group[mask_reverse < 0][bases_reverse[0]].min()
# min_N = min(min_N_forward, min_N_reverse)

def add_min_N_in_group(df, config):
ds = df.groupby("tax_id").apply(compute_min_N_in_group, config)
ds = ds.reset_index().rename(columns={0: "min_N_in_group"})
# max_N_forward = group[mask_forward > 0][bases_forward[0]].max()
# max_N_reverse = group[mask_reverse < 0][bases_reverse[0]].max()
# max_N = max(max_N_forward, max_N_reverse)

# # return min_N, max_N
# return pd.Series({"min_N_in_group": min_N, "max_N_in_group": max_N})


def add_min_max_N_in_group(df, config):
ds = (
df.groupby("tax_id")
.apply(compute_min_max_N_in_group, config)
.reset_index(drop=False)
)
df = pd.merge(df, ds, on=["tax_id"])
return df

Expand Down Expand Up @@ -213,7 +232,7 @@ def compute(config: Config) -> pd.DataFrame:
.pipe(make_reverse_position_negative)
.pipe(add_k_N_x_names, config)
.pipe(add_k_sum_counts)
.pipe(add_min_N_in_group, config)
.pipe(add_min_max_N_in_group, config)
.pipe(make_tax_id_str)
.reset_index(drop=True)
.fillna(0)
Expand Down

0 comments on commit 94d7975

Please sign in to comment.