-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcustom_pdf_to_xlsx.py
90 lines (68 loc) · 3.38 KB
/
custom_pdf_to_xlsx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pdfplumber
import openpyxl
import re
import os
import sys
MAX_NUM_ROWS_PER_EXCEL = 20
COLUMN_NAMES = ['Описание', 'ДДС', 'Брой', 'Ед. цена']
regex_pattern = re.compile(r'''
\d{5,}\s+ # UNUSED serial numer
(.{1,25})\s+ # description
[А-Я]{2}\s+ # UNUSED type of package
([\d,]+)\s+ # single item price
([\d,]+)\s+ # quantity 1
[\d,]+\s+ # UNUSED price
(\d+) # quantity 2
.*
([А-Я]) # VAT
''', re.VERBOSE)
curr_dir = os.getcwd() # Set to current directory
# Construct the path to the 'pdfs' directory inside 'curr_dir'
pdfs_directory = os.path.join(curr_dir, 'pdfs')
# Change the current working directory to 'pdfs'
try:
os.chdir(pdfs_directory)
except FileNotFoundError:
print(f"Error: The specified 'pdfs' directory '{pdfs_directory}' does not exist.")
sys.exit(1)
# Create the 'xlsx' directory if it doesn't exist
xlsx_directory = os.path.join(curr_dir, 'xlsx')
os.makedirs(xlsx_directory, exist_ok=True)
# Get a list of PDF files in the current directory
pdf_files = [file for file in os.listdir() if file.endswith('.pdf')]
# Iterate through each PDF file
for pdf_file_name in pdf_files:
# Open the PDF file using pdfplumber
with pdfplumber.open(pdf_file_name) as pdf:
excel_rows = [] # list of all the matching rows that will be added to excel file(s)
# Iterate through each page in the PDF
for page in pdf.pages:
text = page.extract_text()
# Iterate through each line on that page and match with the regex pattern
for line in text.split('\n'):
if match := regex_pattern.search(line):
description, single_item_price, quantity1, quantity2, VAT = match.groups()
quantity1 = float(quantity1.replace(',', '.'))
quantity2 = int(quantity2)
quantity = quantity1 * quantity2
# if the quantity is in the description (for example "180БР. ЯЙЦА L МCH", fix quantity and single price)
if match_description := re.search(r'(\d+)БР\.', description):
quantity_per_box = int(match_description.group(1))
quantity *= quantity_per_box
single_item_price = round(float(single_item_price.replace(',', '.')) / quantity_per_box, 9)
# save the row in the list
excel_rows.append([description, VAT, quantity, single_item_price])
# Extract the file name without the '.pdf' extension
file_name_without_extension = os.path.splitext(pdf_file_name)[0]
# Iterate through the matching lines and save them in excel files
for i in range(0, len(excel_rows), MAX_NUM_ROWS_PER_EXCEL):
# Create a new Excel workbook and select the active sheet
excel_workbook = openpyxl.Workbook()
excel_sheet = excel_workbook.active
# Set the first row as column names.
excel_sheet.append(COLUMN_NAMES)
# add up to MAX_NUM_ROWS_PER_EXCEL lines in a single xlsx file
for row in excel_rows[i:i + MAX_NUM_ROWS_PER_EXCEL]:
excel_sheet.append(row)
# Save the Excel file with the corresponding PDF file name
excel_workbook.save(os.path.join(xlsx_directory, f'{file_name_without_extension}_{i // MAX_NUM_ROWS_PER_EXCEL + 1}.xlsx'))