Skip to content

Commit

Permalink
Merge pull request #29 from QuivrHQ/feat/xlsx
Browse files Browse the repository at this point in the history
add: xlsx convertor
  • Loading branch information
chloedia authored Jun 17, 2024
2 parents bdb85b1 + 7cb20dd commit 248d414
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,8 @@ megaparse.egg-info/
build/*
ENV
venv
*/evaluations/*
*/cdp/*
*.pkl

!megaparse/tests/output_tests/MegaFake_report.md
30 changes: 29 additions & 1 deletion megaparse/Converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from pathlib import Path
from llama_index.core import download_loader
from unstructured.partition.auto import partition

import pandas as pd

import nest_asyncio

Expand All @@ -39,6 +39,32 @@ def save_md(self, md_content: str, file_path: Path | str) -> None:
with open(file_path, "w") as f:
f.write(md_content)

class XLSXConverter(Converter):
    """Convert a worksheet of an Excel workbook into pipe-delimited text."""

    def __init__(self) -> None:
        pass

    def convert(self, file_path: str) -> str:
        """Render the first worksheet of the workbook at *file_path* as text.

        Only the first sheet (pandas' default ``sheet_name=0``) is read; use
        :meth:`convert_tab` to target another sheet by name.
        """
        return self._sheet_to_text(file_path, 0)

    def convert_tab(self, file_path: str, tab_name: str) -> str:
        """Render the worksheet named *tab_name* of *file_path* as text."""
        return self._sheet_to_text(file_path, tab_name)

    def _sheet_to_text(self, file_path: str, sheet_name: str | int) -> str:
        """Read one worksheet and render it with :meth:`table_to_text`."""
        # The context manager closes the workbook's file handle as soon as
        # the sheet has been loaded, instead of leaving it open until the
        # ExcelFile object happens to be garbage-collected.
        with pd.ExcelFile(file_path) as xls:  # type: ignore
            df = pd.read_excel(xls, sheet_name)
        return self.table_to_text(df)

    def table_to_text(self, df: pd.DataFrame) -> str:
        """Return the rows of *df* as ``|a | b | c|`` lines joined by newlines.

        Missing cells (NaN/None/NaT) are omitted from their row, and rows
        with no remaining text are skipped entirely. Only data rows are
        emitted; the DataFrame's column labels are not part of the output.

        NOTE(review): because empty cells are dropped rather than rendered
        as blanks, values in sparse rows do not stay aligned with their
        original columns.
        """
        text_rows = []
        for _, row in df.iterrows():
            row_text = " | ".join(
                str(value) for value in row.values if pd.notna(value)
            )
            if row_text:
                text_rows.append("|" + row_text + "|")
        return "\n".join(text_rows)


class DOCXConverter(Converter):
def __init__(self) -> None:
Expand Down Expand Up @@ -286,6 +312,8 @@ def convert(self, **kwargs) -> str:
converter = PPTXConverter()
elif file_extension == ".pdf":
converter = PDFConverter(llama_parse_api_key=self.llama_parse_api_key)
elif file_extension == ".xlsx":
converter = XLSXConverter()
else:
print(self.file_path, file_extension)
raise ValueError(f"Unsupported file extension: {file_extension}")
Expand Down

0 comments on commit 248d414

Please sign in to comment.