
Commit

further updates, for sub-question extraction.
djl11 committed Jan 23, 2025
1 parent 4ce9f8f commit 0217539
Showing 2 changed files with 31 additions and 5 deletions.
24 changes: 19 additions & 5 deletions interfaces/ai_tutor/fetch_and_parse_pdf.py
@@ -9,7 +9,7 @@

 import unify
 from prompts import *
-from helpers import encode_image
+from helpers import encode_image, parse_key

 url = (
     "https://www.ocr.org.uk/Images/169000-foundation-tier-sample-assessment"
@@ -92,7 +92,7 @@ def _fill_missing_questions_n_pages(questions_to_pages):
     new_questions_to_pages = questions_to_pages.copy()
     for i, (question_alphanum, pages) in enumerate(questions_to_pages.items()):
         question_num = int(question_alphanum.split(".")[0])
-        if question_num != prev_question_num + 1:
+        if question_num not in (prev_question_num, prev_question_num + 1):
             min_pg = min(questions_to_pages[i-1])
             max_pg = max(pages)
             union_of_pages = list(range(min_pg, max_pg + 1))
@@ -105,7 +105,7 @@ def _fill_missing_questions_n_pages(questions_to_pages):
             new_questions_to_pages[question_alphanum] = (
                 list(range(prev_pages[-1], pages[-1] + 1))
             )
-        prev_question_num = question_alphanum
+        prev_question_num = question_num
         prev_pages = pages
     return dict(sorted(new_questions_to_pages.items()))

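For reference, the relaxed gap check now tolerates consecutive sub-question keys that share the same numeric prefix. A minimal standalone sketch (the keys, page lists, and the starting value of prev_question_num below are invented for illustration):

# Illustration only: hypothetical question/sub-question keys mapped to page lists.
questions_to_pages = {"1": [3], "2.a": [4], "2.b": [4, 5], "4.a": [7]}

prev_question_num = 0
for question_alphanum in questions_to_pages:
    question_num = int(question_alphanum.split(".")[0])
    # The old check (question_num != prev_question_num + 1) would flag "2.b" as a
    # gap because its numeric prefix repeats; the new check only fires when a
    # whole question number is skipped (here, jumping from 2 to 4).
    if question_num not in (prev_question_num, prev_question_num + 1):
        print("gap detected before", question_alphanum)
    prev_question_num = question_num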
@@ -254,7 +254,9 @@ def parse_into_pages():
     return _fill_missing_questions_n_pages(question_to_pages), latest_num

 question_to_pages, num_questions = parse_into_pages()
-
+question_to_pages = dict(
+    sorted(question_to_pages.items(), key=lambda item: parse_key(item[0]))
+)
 with open(os.path.join(paper_dir, "question_to_pages.json"), "w+") as file:
     file.write(json.dumps(question_to_pages, indent=4))

@@ -267,7 +269,15 @@ def parse_question(question_num: int):
         cache=True,
         system_message=TEXT_ONLY_DETECTION,
     )
-    pages = question_to_pages[question_num]
+    sub_questions = [
+        k.split(".")[-1] for k, v in question_to_pages.items()
+        if k.startswith(str(question_num) + ".")
+    ]
+    pages = [
+        v for k, v in question_to_pages.items()
+        if (k == str(question_num) or k.startswith(str(question_num) + "."))
+    ]
+    pages = list(dict.fromkeys([item for sublist in pages for item in sublist]))
     current_text = "".join([reader.pages[pg - 1].extract_text() for pg in pages])
     imgs = [all_images[pg - 1] for pg in pages]
     question_parser.set_system_message(
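For reference, a standalone sketch of what the new sub-question and page extraction yields, using an invented question_to_pages mapping:

# Invented mapping for illustration; the real one comes from parse_into_pages().
question_to_pages = {"3.a": [5], "3.b": [5, 6], "4": [7]}
question_num = 3

# Sub-question labels ("a", "b", ...) belonging to this question number.
sub_questions = [
    k.split(".")[-1] for k, v in question_to_pages.items()
    if k.startswith(str(question_num) + ".")
]
# Page lists for the question itself and all of its sub-questions.
pages = [
    v for k, v in question_to_pages.items()
    if (k == str(question_num) or k.startswith(str(question_num) + "."))
]
# Flatten and de-duplicate while preserving order.
pages = list(dict.fromkeys([item for sublist in pages for item in sublist]))

print(sub_questions)  # ['a', 'b']
print(pages)          # [5, 6]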
Expand Down Expand Up @@ -311,6 +321,7 @@ def parse_question(question_num: int):
"text": question_parsed,
"text-only": text_only,
"pages": pages,
"sub-questions": sub_questions,
"correctly_parsed": True,
}
parsed = json.dumps(dict(sorted(questions.items())), indent=4)
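The per-question record now carries the sub-question labels alongside the pages. A rough sketch of one entry's shape after this change (field values are placeholders, not real parsed output):

import json

example_entry = {
    "text": "3 (a) Work out ...",     # parsed question text (placeholder)
    "text-only": True,                 # result of the text-only detection step
    "pages": [5, 6],
    "sub-questions": ["a", "b"],
    "correctly_parsed": True,
}
print(json.dumps(example_entry, indent=4))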
@@ -524,6 +535,9 @@ def parse_into_pages():
     return _fill_missing_questions_n_pages(question_to_pages), max(detected_numeric)

 question_to_pages, num_questions = parse_into_pages()
+question_to_pages = dict(
+    sorted(question_to_pages.items(), key=lambda item: parse_key(item[0]))
+)
 with open(os.path.join(markscheme_dir, "question_to_pages.json"), "w+") as file:
     file.write(json.dumps(question_to_pages, indent=4))

12 changes: 12 additions & 0 deletions interfaces/ai_tutor/helpers.py
@@ -4,6 +4,18 @@
 import base64


+def parse_key(k: str):
+    """
+    Splits the string on the first dot.
+    - The part before the dot is turned into an integer (if there's no dot, the whole key is an integer).
+    - The remainder (if any) is kept as a suffix for secondary sorting.
+    """
+    parts = k.split('.', 1)
+    num = int(parts[0])  # integer portion
+    suffix = parts[1] if len(parts) > 1 else ""  # suffix after the first dot, if any
+    return num, suffix
+
+
 def encode_image(image_path):
     _, buffer = cv2.imencode(".jpg", image_path)
     return base64.b64encode(buffer).decode("utf-8")
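A quick usage sketch of the new helper (example keys are hypothetical; assumes helpers.py is importable): a plain string sort would place "10.a" before "2.a", whereas parse_key sorts by question number first and sub-question suffix second.

from helpers import parse_key  # assumes interfaces/ai_tutor is on the import path

keys = ["10.a", "2.b", "2.a", "1"]
print(sorted(keys))                 # ['1', '10.a', '2.a', '2.b']  (lexicographic)
print(sorted(keys, key=parse_key))  # ['1', '2.a', '2.b', '10.a']  (numeric, then suffix)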