Checking input MSA characters

niklases · Jan 5, 2024 · 7ba9f6a
1 parent 1bdd482
commit 7ba9f6a
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 2 deletions.
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -189,6 +189,21 @@
                 "--params", "PLMC",
                 "--threads", "24"
             ]
+        },
+
+        {
+            "name": "Python: PyPEF !wrong! MSA input format (STO)",
+            "type": "python",
+            "request": "launch",
+            "env": {"PYTHONPATH": "${workspaceFolder}"},
+            "program": "${workspaceFolder}/pypef/main.py",
+            "console": "integratedTerminal",
+            "justMyCode": true,
+            "cwd": "${workspaceFolder}/datasets/AVGFP/",
+            "args": [
+                "param_inference", 
+                "--msa", "uref100_avgfp_jhmmer_119.sto"
+            ]
         }
     ]
 }
diff --git a/pypef/utils/variant_data.py b/pypef/utils/variant_data.py
@@ -110,6 +110,9 @@ def get_sequences_from_file(
     values = []
     names_of_mutations = []
 
+    allowed_chars = "ABCDEFGHIKLMNPQRSTVWYX-."
+    allowed_chars += allowed_chars.lower()
+
     with open(fasta, 'r') as f:
         words = ""
         for line in f:
@@ -132,9 +135,17 @@ def get_sequences_from_file(
 
             else:
                 try:
-                    words += line.strip()
+                    line = line.strip()
+                    if any(not c in line for c in allowed_chars):
+                        for c in line:
+                            if c not in allowed_chars:
+                                raise SystemError(
+                                    f"The input file(s) (MSA or train/test sets) contain(s) unknown protein sequence characters "
+                                    f"(e.g.: \"{c}\"). Note that an MSA has to be provided in FASTA or A2M format (or formatted as "
+                                    F"pure linebreak-separated sequences).")
+                    words += line
                 except IndexError:
-                    raise IndexError("Learning or Validation sets (.fasta) likely "
+                    raise IndexError("Sequences in input file(s) likely "
                                      "have emtpy lines (e.g. at end of file)")
         if words != "":
             sequences.append(words)