Skip to content

Commit

Permalink
fix: handle missing locus tag in MIBIG entries
Browse files (browse the repository at this point in the history)
  • Loading branch information
matinnuhamunada committed Jan 11, 2024
1 parent 0d2facc commit 077023f
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 1 deletion.
5 changes: 4 additions & 1 deletion workflow/bgcflow/bgcflow/features/clinker_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,10 @@ def build_cds_dict(gbk_dir):
locus_tag = feature.qualifiers["protein_id"][0]
logging.warning(locus_tag)
if seq_id.startswith("BGC"):
locus_tag = feature.qualifiers["protein_id"][0]
if "protein_id" in feature.qualifiers.keys():
locus_tag = feature.qualifiers["protein_id"][0]
else:
locus_tag = feature.qualifiers["locus_tag"][0]
cds_dict[locus_tag] = {
"seq_id": seq_id,
"start": start,
Expand Down
17 changes: 17 additions & 0 deletions workflow/bgcflow/bgcflow/features/mmseqs2_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ def annotate_mmseqs2_cog(mmseqs2_cog: str, gbk_path: str, outfile: str) -> None:
for f in record.features:
if f.type == "CDS":
value = {k: ",".join(v) for k, v in f.qualifiers.items()}
if "locus_tag" not in value.keys():
logging.warning(
f"Could not find locus_tag in {file_id} {seq_id}. Available keys: {value.keys()}"
)
for locus_tag in ["protein_id", "gene"]:
if locus_tag in value:
logging.info(
f"Using {locus_tag} as locus_tag in {file_id} {seq_id}."
)
value["locus_tag"] = value[locus_tag]
break
value["file_id"] = file_id
value["seq_id"] = seq_id
value["start"] = int(f.location.start)
Expand All @@ -51,9 +62,15 @@ def annotate_mmseqs2_cog(mmseqs2_cog: str, gbk_path: str, outfile: str) -> None:
df_annotation = pd.DataFrame.from_dict(output).T

logging.info("Merging dataframes...")
logging.info(f"Length of MMseqs2 COG dataframe: {df_mmseqs2.shape}")
logging.info(f"Length of annotation dataframe: {df_annotation.shape}")
df_mmseqs2.set_index("locus_tag").to_csv("mmseqs2_cog2.csv")
df_annotation.set_index("locus_tag").to_csv("annotation2.csv")
df = df_mmseqs2.merge(
df_annotation, right_on="locus_tag", left_on="locus_tag", how="outer"
)
logging.info(f"Length of merged dataframe: {df.shape}")
df.set_index("locus_tag").to_csv("merged2s.csv")
assert len(df_mmseqs2) == len(
df
), "Error: Merged dataframe has different length than original MMseqs2 COG dataframe."
Expand Down

0 comments on commit 077023f

Please sign in to comment.