Skip to content

Commit

Permalink
Merge pull request #24 from dSupertramp/feat/add-docstrings-and-fix-openai-key
Browse files Browse the repository at this point in the history
  • Loading branch information
chloedia authored Jun 5, 2024
2 parents fb34ff0 + 96edcd5 commit 94dd1ba
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 22 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,7 @@ dist/**
megaparse.egg-info/
*.pyc
build/*
ENV
venv

!megaparse/tests/output_tests/MegaFake_report.md
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ poppler-utils = "*"
markdownify = "*"
langchain-openai = "*"
langchain-core = "*"
python-dotenv = "*"

[dev-packages]
ipykernel = "*"
Expand Down
2 changes: 1 addition & 1 deletion megaparse/Converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def _unstructured_parse(self, file_path: str):
unstructured_parser = UnstructuredParser()
return unstructured_parser.convert(file_path)

def convert(self, file_path: str, gpt4o_cleaner = False) -> str:
def convert(self, file_path: str, gpt4o_cleaner=False) -> str:
parsed_md = ""
if self.llama_parse_api_key:
parsed_md = self._llama_parse(self.llama_parse_api_key, file_path)
Expand Down
114 changes: 94 additions & 20 deletions megaparse/markdown_processor.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,70 @@
import os
from collections import Counter
from typing import Literal
from typing import List, Tuple, Dict
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv


class MarkdownProcessor:
"""
Class for MarkdownProcessor.
"""

load_dotenv()

def __init__(self, md_result: str, strict: bool, remove_pagination: bool):
self.md_result = md_result
self.strict = strict
self.remove_pagination = remove_pagination

@staticmethod
def clean(text: str) -> str:
"""Clean the input text by removing newlines, double asterisks, and trimming whitespace."""
"""
Clean the input text by removing newlines, double asterisks, and trimming whitespace.
Args:
text (str): Input text
Returns:
str: Cleaned text
"""
text = text.replace("\n", "")
text = text.replace("**", "")
text = text.strip()
return text

def split_into_pages(self) -> List[str]:
    """
    Split the markdown result into pages using triple newlines as the delimiter.

    Returns:
        List[str]: Pages of the markdown result
    """
    # A triple newline is how the upstream parser separates pages
    # in the concatenated markdown output.
    return self.md_result.split("\n\n\n")

@staticmethod
def split_into_paragraphs(pages: list) -> list:
"""Split pages into paragraphs using double newlines as the delimiter."""
def split_into_paragraphs(pages: list) -> List[str]:
"""
Split pages into paragraphs using double newlines as the delimiter.
Args:
pages (list): Pages
Returns:
List[str]: Splitted pages
"""
return "\n\n".join(pages).split("\n\n")

def remove_duplicates(self, paragraphs: list) -> tuple:
"""Remove duplicate paragraphs and identify unique and duplicate paragraphs."""
def remove_duplicates(self, paragraphs: list) -> Tuple[str, List[str]]:
"""
Remove duplicate paragraphs and identify unique and duplicate paragraphs.
Args:
paragraphs (list): Paragraphs
Returns:
Tuple[str, List[str]]: Cleaned paragraphs and duplicate paragraphs
"""
unique_paragraphs = list(
set([self.clean(paragraph) for paragraph in paragraphs])
)
Expand All @@ -42,11 +78,18 @@ def remove_duplicates(self, paragraphs: list) -> tuple:
unique_paragraphs.remove(cleaned_paragraph)
else:
duplicate_paragraphs.append(paragraph)

return cleaned_paragraphs, duplicate_paragraphs

def identify_header_components(self, duplicate_paragraphs: list) -> dict:
"""Identify words in duplicate paragraphs that are likely header components."""
def identify_header_components(self, duplicate_paragraphs: list) -> Dict:
"""
Identify words in duplicate paragraphs that are likely header components.
Args:
duplicate_paragraphs (list): Duplicate paragraphs
Returns:
Dict: Header components
"""
header_components = list(
set([self.clean(paragraph) for paragraph in duplicate_paragraphs])
)
Expand All @@ -60,9 +103,18 @@ def identify_header_components(self, duplicate_paragraphs: list) -> dict:
return header_components_count

def remove_header_lines(
self, paragraphs: list, header_components_count: dict
) -> list:
"""Remove paragraphs that contain any of the header words or the word 'Page' if remove_pagination is true."""
self, paragraphs: List[str], header_components_count: Dict
) -> List[str]:
"""
Remove paragraphs that contain any of the header words or the word 'Page' if remove_pagination is true.
Args:
paragraphs (List[str]): Paragraphs
header_components_count (Dict): Header components
Returns:
List[str]: New paragraphs
"""

def should_remove(paragraph):
if self.remove_pagination and "Page" in paragraph:
Expand All @@ -71,17 +123,32 @@ def should_remove(paragraph):

return [paragraph for paragraph in paragraphs if not should_remove(paragraph)]

def merge_tables(self, md_content: str) -> str:
    """
    Merge tables inside Markdown content.

    Args:
        md_content (str): Markdown content

    Returns:
        str: Markdown content with adjacent table fragments merged
    """
    # A blank line between two table rows renders as two separate tables;
    # collapsing "|\n\n|" into "|\n|" stitches them back into one.
    return md_content.replace("|\n\n|", "|\n|")

def save_cleaned_result(self, cleaned_result: str, output_path: str) -> None:
    """
    Save the cleaned paragraphs to a markdown file.

    Args:
        cleaned_result (str): Cleaned result
        output_path (str): Output path
    """
    # Context manager guarantees the file handle is closed even on error.
    with open(output_path, "w") as f:
        f.write(cleaned_result)

def remove_header_llm(self):
llm = ChatOpenAI(model="gpt-4o")
llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))
# Define the prompt
messages = [
(
Expand All @@ -102,9 +169,16 @@ def remove_header_llm(self):

return result.content

def process(self, gpt4o_cleaner=False):
"""Process the markdown result by removing duplicate paragraphs and headers."""
def process(self, gpt4o_cleaner=False) -> str:
"""
Process the markdown result by removing duplicate paragraphs and headers.
Args:
gpt4o_cleaner (bool, optional): GPT-4o cleaner. Defaults to False.
Returns:
str: Cleaned result
"""
if gpt4o_cleaner:
cleaned_result = self.remove_header_llm()

Expand Down
5 changes: 4 additions & 1 deletion megaparse/unstructured.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from unstructured.partition.pdf import partition_pdf
from dotenv import load_dotenv
import os


class UnstructuredParser:
load_dotenv()

# Function to convert element category to markdown format
def convert_to_markdown(self, elements):
Expand Down Expand Up @@ -73,7 +76,7 @@ def partition_pdf_file(self, path):
)

def improve_table_elements(self, elements):
llm = ChatOpenAI(model="gpt-4o")
llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))

# Define the prompt
messages = [
Expand Down

0 comments on commit 94dd1ba

Please sign in to comment.