forked from kaixindelele/ChatPaper
-
Notifications
You must be signed in to change notification settings - Fork 2
/
test_pdfminer.py
47 lines (34 loc) · 1.34 KB
/
test_pdfminer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
def convert_pdf_to_string(file_path):
    """Return the text that pdfminer extracts from the PDF at *file_path*.

    The file is opened in binary mode, parsed into a ``PDFDocument``, and
    every page yielded by ``PDFPage.create_pages`` is run through a
    ``PDFPageInterpreter`` whose ``TextConverter`` device (configured with a
    default ``LAParams()``) writes into an in-memory ``StringIO`` buffer.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The accumulated contents of the buffer as a single string.
    """
    text_buffer = StringIO()
    with open(file_path, "rb") as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resource_manager = PDFResourceManager()
        # The converter is the "device" the interpreter renders pages onto;
        # everything it produces lands in text_buffer.
        converter = TextConverter(
            resource_manager, text_buffer, laparams=LAParams()
        )
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
        extracted_text = text_buffer.getvalue()
    return extracted_text
def convert_title_to_filename(title):
    """Return *title* lower-cased with every space replaced by an underscore.

    Only the space character is replaced; other characters (including tabs
    and punctuation) are passed through unchanged apart from lower-casing.
    """
    return title.lower().replace(" ", "_")
def split_to_title_and_pagenum(table_of_contents_entry):
    """Split a table-of-contents line into its title and trailing page number.

    Surrounding whitespace is stripped from the entry first. The page number
    is the run of decimal digits at the very end of the stripped entry; the
    title is everything before that run, with surrounding whitespace removed.

    Args:
        table_of_contents_entry: One line of a table of contents, e.g.
            ``"Introduction 5"``.

    Returns:
        A ``(title, pagenum)`` tuple. ``pagenum`` is an ``int``. If the entry
        is empty or does not end in a decimal digit, ``(None, None)`` is
        returned. An entry consisting only of digits yields ``("", pagenum)``.
    """
    entry = table_of_contents_entry.strip()

    # Walk left over the trailing digit run. The explicit lower bound keeps
    # an all-digit entry from indexing past the start of the string.
    # isdecimal() (not isdigit()) is used because it matches exactly the
    # characters int() can parse; isdigit() also accepts e.g. superscripts.
    split_at = len(entry)
    while split_at > 0 and entry[split_at - 1].isdecimal():
        split_at -= 1

    if split_at == len(entry):
        # No trailing digits (this also covers the empty entry).
        return None, None

    # Split exactly at the first digit so the character preceding the number
    # stays with the title instead of being fed to int().
    title = entry[:split_at].strip()
    pagenum = int(entry[split_at:])
    return title, pagenum