Skip to content

Commit

Permalink
Checking input MSA characters
Browse files (browse the repository at this point in the history)
  • Loading branch information
niklases committed Jan 5, 2024
1 parent 1bdd482 commit 7ba9f6a
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 2 deletions.
15 changes: 15 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,21 @@
"--params", "PLMC",
"--threads", "24"
]
},

{
"name": "Python: PyPEF !wrong! MSA input format (STO)",
"type": "python",
"request": "launch",
"env": {"PYTHONPATH": "${workspaceFolder}"},
"program": "${workspaceFolder}/pypef/main.py",
"console": "integratedTerminal",
"justMyCode": true,
"cwd": "${workspaceFolder}/datasets/AVGFP/",
"args": [
"param_inference",
"--msa", "uref100_avgfp_jhmmer_119.sto"
]
}
]
}
15 changes: 13 additions & 2 deletions pypef/utils/variant_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ def get_sequences_from_file(
values = []
names_of_mutations = []

allowed_chars = "ABCDEFGHIKLMNPQRSTVWYX-."
allowed_chars += allowed_chars.lower()

with open(fasta, 'r') as f:
words = ""
for line in f:
Expand All @@ -132,9 +135,17 @@ def get_sequences_from_file(

else:
try:
words += line.strip()
line = line.strip()
if any(not c in line for c in allowed_chars):
for c in line:
if c not in allowed_chars:
raise SystemError(
f"The input file(s) (MSA or train/test sets) contain(s) unknown protein sequence characters "
f"(e.g.: \"{c}\"). Note that an MSA has to be provided in FASTA or A2M format (or formatted as "
F"pure linebreak-separated sequences).")
words += line
except IndexError:
raise IndexError("Learning or Validation sets (.fasta) likely "
raise IndexError("Sequences in input file(s) likely "
"have emtpy lines (e.g. at end of file)")
if words != "":
sequences.append(words)
Expand Down

0 comments on commit 7ba9f6a

Please sign in to comment.