
Commit

further updates, for sub-question extraction.
djl11 committed Jan 23, 2025
1 parent 4ce9f8f commit 0217539
Showing 2 changed files with 31 additions and 5 deletions.
24 changes: 19 additions & 5 deletions interfaces/ai_tutor/fetch_and_parse_pdf.py
@@ -9,7 +9,7 @@

 import unify
 from prompts import *
-from helpers import encode_image
+from helpers import encode_image, parse_key

 url = (
     "https://www.ocr.org.uk/Images/169000-foundation-tier-sample-assessment"
@@ -92,7 +92,7 @@ def _fill_missing_questions_n_pages(questions_to_pages):
     new_questions_to_pages = questions_to_pages.copy()
     for i, (question_alphanum, pages) in enumerate(questions_to_pages.items()):
         question_num = int(question_alphanum.split(".")[0])
-        if question_num != prev_question_num + 1:
+        if question_num not in (prev_question_num, prev_question_num + 1):
             min_pg = min(questions_to_pages[i-1])
             max_pg = max(pages)
             union_of_pages = list(range(min_pg, max_pg + 1))
@@ -105,7 +105,7 @@ def _fill_missing_questions_n_pages(questions_to_pages):
             new_questions_to_pages[question_alphanum] = (
                 list(range(prev_pages[-1], pages[-1] + 1))
             )
-        prev_question_num = question_alphanum
+        prev_question_num = question_num
         prev_pages = pages
     return dict(sorted(new_questions_to_pages.items()))

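For reference, the relaxed gap check now tolerates consecutive sub-question keys that share the same numeric prefix. A minimal standalone sketch (the keys, page lists, and the starting value of prev_question_num below are invented for illustration):

# Illustration only: hypothetical question/sub-question keys mapped to page lists.
questions_to_pages = {"1": [3], "2.a": [4], "2.b": [4, 5], "4.a": [7]}

prev_question_num = 0
for question_alphanum in questions_to_pages:
    question_num = int(question_alphanum.split(".")[0])
    # The old check (question_num != prev_question_num + 1) would flag "2.b" as a
    # gap because its numeric prefix repeats; the new check only fires when a
    # whole question number is skipped (here, jumping from 2 to 4).
    if question_num not in (prev_question_num, prev_question_num + 1):
        print("gap detected before", question_alphanum)
    prev_question_num = question_num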
@@ -254,7 +254,9 @@ def parse_into_pages():
     return _fill_missing_questions_n_pages(question_to_pages), latest_num

 question_to_pages, num_questions = parse_into_pages()
-
+question_to_pages = dict(
+    sorted(question_to_pages.items(), key=lambda item: parse_key(item[0]))
+)
 with open(os.path.join(paper_dir, "question_to_pages.json"), "w+") as file:
     file.write(json.dumps(question_to_pages, indent=4))

@@ -267,7 +269,15 @@ def parse_question(question_num: int):
         cache=True,
         system_message=TEXT_ONLY_DETECTION,
     )
-    pages = question_to_pages[question_num]
+    sub_questions = [
+        k.split(".")[-1] for k, v in question_to_pages.items()
+        if k.startswith(str(question_num) + ".")
+    ]
+    pages = [
+        v for k, v in question_to_pages.items()
+        if (k == str(question_num) or k.startswith(str(question_num) + "."))
+    ]
+    pages = list(dict.fromkeys([item for sublist in pages for item in sublist]))
     current_text = "".join([reader.pages[pg - 1].extract_text() for pg in pages])
     imgs = [all_images[pg - 1] for pg in pages]
     question_parser.set_system_message(
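For reference, a standalone sketch of what the new sub-question and page extraction yields, using an invented question_to_pages mapping:

# Invented mapping for illustration; the real one comes from parse_into_pages().
question_to_pages = {"3.a": [5], "3.b": [5, 6], "4": [7]}
question_num = 3

# Sub-question labels ("a", "b", ...) belonging to this question number.
sub_questions = [
    k.split(".")[-1] for k, v in question_to_pages.items()
    if k.startswith(str(question_num) + ".")
]
# Page lists for the question itself and all of its sub-questions.
pages = [
    v for k, v in question_to_pages.items()
    if (k == str(question_num) or k.startswith(str(question_num) + "."))
]
# Flatten and de-duplicate while preserving order.
pages = list(dict.fromkeys([item for sublist in pages for item in sublist]))

print(sub_questions)  # ['a', 'b']
print(pages)          # [5, 6]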
Expand Down Expand Up @@ -311,6 +321,7 @@ def parse_question(question_num: int):
"text": question_parsed,
"text-only": text_only,
"pages": pages,
"sub-questions": sub_questions,
"correctly_parsed": True,
}
parsed = json.dumps(dict(sorted(questions.items())), indent=4)
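The per-question record now carries the sub-question labels alongside the pages. A rough sketch of one entry's shape after this change (field values are placeholders, not real parsed output):

import json

example_entry = {
    "text": "3 (a) Work out ...",     # parsed question text (placeholder)
    "text-only": True,                 # result of the text-only detection step
    "pages": [5, 6],
    "sub-questions": ["a", "b"],
    "correctly_parsed": True,
}
print(json.dumps(example_entry, indent=4))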
@@ -524,6 +535,9 @@ def parse_into_pages():
     return _fill_missing_questions_n_pages(question_to_pages), max(detected_numeric)

 question_to_pages, num_questions = parse_into_pages()
+question_to_pages = dict(
+    sorted(question_to_pages.items(), key=lambda item: parse_key(item[0]))
+)
 with open(os.path.join(markscheme_dir, "question_to_pages.json"), "w+") as file:
     file.write(json.dumps(question_to_pages, indent=4))

12 changes: 12 additions & 0 deletions interfaces/ai_tutor/helpers.py
@@ -4,6 +4,18 @@
 import base64


+def parse_key(k: str):
+    """
+    Splits the string on the first dot.
+    - The part before the dot is turned into an integer (if there's no dot, the whole key is an integer).
+    - The remainder (if any) is kept as a suffix for secondary sorting.
+    """
+    parts = k.split('.', 1)
+    num = int(parts[0])  # integer portion
+    suffix = parts[1] if len(parts) > 1 else ""  # suffix after the first dot, if any
+    return num, suffix
+
+
 def encode_image(image_path):
     _, buffer = cv2.imencode(".jpg", image_path)
     return base64.b64encode(buffer).decode("utf-8")
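A quick usage sketch of the new helper (example keys are hypothetical; assumes helpers.py is importable): a plain string sort would place "10.a" before "2.a", whereas parse_key sorts by question number first and sub-question suffix second.

from helpers import parse_key  # assumes interfaces/ai_tutor is on the import path

keys = ["10.a", "2.b", "2.a", "1"]
print(sorted(keys))                 # ['1', '10.a', '2.a', '2.b']  (lexicographic)
print(sorted(keys, key=parse_key))  # ['1', '2.a', '2.b', '10.a']  (numeric, then suffix)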