-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_generator.py
377 lines (314 loc) · 15.2 KB
/
pdf_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
from typing import Dict, List, Optional, Tuple
from pathlib import Path
from fpdf import FPDF
from bs4 import BeautifulSoup
from PIL import Image
import logging
import warnings
from datetime import datetime
import re
import html
import emoji
from cache_manager import CacheManager
class PDFGenerator:
    """Render a blog post dictionary to a PDF file, drawing emojis as images.

    The post dictionary is read through the keys ``title.rendered``,
    ``content.rendered``, ``date``, ``id`` and ``_embedded['wp:term']``
    (this looks like the WordPress REST API shape -- confirm against the
    caller). Text is set in the NotoSans faces found in the ``fonts``
    directory next to this module; emojis are fetched as images through
    ``CacheManager`` and placed inline.
    """

    # Line height expressed as a multiple of the font size.
    LINE_HEIGHT_FACTOR = 1.5

    # A "word" together with its trailing whitespace, or a run of bare
    # whitespace; concatenating all matches reproduces the input exactly.
    _WORD_CHUNK_RE = re.compile(r'\S+\s*|\s+')

    # Tags whose text flows inside the surrounding paragraph. Every other
    # tag is treated as a block that starts a new paragraph.
    _INLINE_TAGS = frozenset({
        'a', 'abbr', 'b', 'cite', 'code', 'del', 'em', 'i', 'ins', 'kbd',
        'mark', 'q', 's', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
        'u',
    })

    def __init__(self, output_dir: Path):
        """Create the output and emoji-cache directories and record font paths.

        Args:
            output_dir: Directory that receives the generated PDFs. It is
                created when missing. A plain string path is accepted too.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.font_name = "NotoSans"
        # Emoji height relative to the font size.
        self.emoji_scale = 0.85

        # Emoji images are cached next to this module.
        self.cache_dir = Path(__file__).parent / "emoji_cache"
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.cache_manager = CacheManager(self.cache_dir)

        # Font subsetting is chatty at INFO level; keep only warnings.
        logging.getLogger('fontTools.subset').setLevel(logging.WARNING)

        # Font files keyed by FPDF style code ('' regular, 'B' bold, 'I' italic).
        self.font_dir = Path(__file__).parent / "fonts"
        self.fonts = {
            '': str(self.font_dir / 'NotoSans-Regular.ttf'),
            'B': str(self.font_dir / 'NotoSans-Bold.ttf'),
            'I': str(self.font_dir / 'NotoSans-Italic.ttf'),
        }

    def _emoji_size(self, pdf: "FPDF", font_size: float) -> float:
        """Return the emoji height in the document's user unit.

        ``pdf.k`` is the number of points per user unit, so a size given in
        points is converted by dividing by it.
        """
        return font_size * self.emoji_scale / pdf.k

    def _line_height(self, pdf: "FPDF", font_size: float) -> float:
        """Return the line height (font size x factor) in the user unit."""
        return font_size * self.LINE_HEIGHT_FACTOR / pdf.k

    def _wrap_segments(
        self,
        segments: List[Tuple[bool, str]],
        measure,
        max_width: float,
        emoji_width: float,
    ) -> List[Tuple[List[Tuple[bool, str]], float]]:
        """Greedily pack text/emoji segments into lines of limited width.

        Text segments are broken at word boundaries; emojis are atomic.
        Consecutive text pieces on the same line are merged back into one
        string so each line needs as few write calls as possible.

        Args:
            segments: ``(is_emoji, content)`` pairs in reading order.
            measure: Callable returning the rendered width of a string.
            max_width: Maximum width of one line.
            emoji_width: Width reserved for every emoji.

        Returns:
            A list of ``(line_segments, line_width)`` tuples. A single unit
            wider than ``max_width`` is placed on a line of its own rather
            than being split here.
        """
        lines: List[Tuple[List[Tuple[bool, str]], float]] = []
        current: List[Tuple[bool, str]] = []
        width = 0.0

        for is_emoji, content in segments:
            units = [content] if is_emoji else self._WORD_CHUNK_RE.findall(content)
            for unit in units:
                unit_width = emoji_width if is_emoji else measure(unit)

                # Close the line only if it already holds something, so an
                # oversized first unit never produces an empty line.
                if current and width + unit_width > max_width:
                    lines.append((current, width))
                    current, width = [], 0.0

                # Do not start a continuation line with bare whitespace.
                if not is_emoji and not current and lines and unit.isspace():
                    continue

                if not is_emoji and current and not current[-1][0]:
                    current[-1] = (False, current[-1][1] + unit)
                else:
                    current.append((is_emoji, unit))
                width += unit_width

        if current:
            lines.append((current, width))
        return lines

    def _write_text_with_emojis(
        self,
        pdf: "FPDF",
        text: str,
        font_size: int = 12,
        style: str = '',
        align: str = 'L',
    ) -> None:
        """Write a paragraph with inline emoji images, wrapping at words.

        Writing starts at the left margin on the current y position. When
        done, the cursor is at the left margin one line below the last line
        that was written.

        Args:
            pdf: Target document.
            text: Paragraph text; may contain emojis.
            font_size: Font size in points.
            style: FPDF font style code ('', 'B' or 'I').
            align: 'L' (default) for left-aligned lines, 'C' to centre each
                line between the margins.
        """
        if not text:
            return

        emoji_size = self._emoji_size(pdf, font_size)
        line_height = self._line_height(pdf, font_size)
        x_start = pdf.l_margin
        # pdf.write() keeps the cell margin free on both sides of the text,
        # so the usable width is slightly smaller than margin to margin.
        max_width = pdf.w - pdf.r_margin - x_start - 2 * pdf.c_margin

        # The font must be active before string widths can be measured.
        pdf.set_font(self.font_name, style, font_size)
        lines = self._wrap_segments(
            self._split_text_and_emojis(text),
            pdf.get_string_width,
            max_width,
            emoji_size,
        )

        y_position = pdf.get_y()
        for line_segments, line_width in lines:
            line_x = x_start
            if align == 'C':
                line_x += max(0.0, (max_width - line_width) / 2)
            # _write_line reports where it really ended: a page break or an
            # overlong word wrapped by pdf.write() both move the cursor.
            last_y = self._write_line(
                pdf, line_segments, line_x, y_position, line_height, font_size, style
            )
            y_position = last_y + line_height

        pdf.set_y(y_position)

    def _split_text_and_emojis(self, text: str) -> List[Tuple[bool, str]]:
        """Split text into ``(is_emoji, content)`` runs in reading order.

        At every position the longest key of ``emoji.EMOJI_DATA`` that
        matches is taken, so multi-code-point emojis (variation selectors,
        joiner sequences) stay in one piece.
        """
        segments: List[Tuple[bool, str]] = []
        if not text:
            return segments

        emoji_table = emoji.EMOJI_DATA
        longest = max(map(len, emoji_table), default=0)
        pending: List[str] = []
        length = len(text)
        i = 0

        while i < length:
            matched = 0
            # Try the longest candidate first; each probe is a dict lookup.
            for size in range(min(longest, length - i), 0, -1):
                if text[i:i + size] in emoji_table:
                    matched = size
                    break

            if matched:
                if pending:
                    segments.append((False, ''.join(pending)))
                    pending = []
                segments.append((True, text[i:i + matched]))
                i += matched
            else:
                pending.append(text[i])
                i += 1

        if pending:
            segments.append((False, ''.join(pending)))
        return segments

    def create_pdf(self, post: Dict, images: "List[Optional[Image.Image]]") -> Path:
        """Create a PDF from a blog post and its images.

        Layout: centred title, date and tag line separated by rules, then
        the body paragraphs, then every image on a page of its own.

        Args:
            post: Post dictionary (see the class docstring for the keys read).
            images: Images to append; ``None`` entries are skipped.

        Returns:
            Path of the written PDF inside ``output_dir``.

        Raises:
            Exception: Any failure while building or saving the document is
                logged and re-raised unchanged.
        """
        try:
            pdf = self._create_pdf_instance()
            pdf.add_page()
            pdf.set_auto_page_break(auto=True, margin=15)

            # Title (centred)
            title = self._clean_html_text(post.get('title', {}).get('rendered', 'Untitled'))
            pdf.set_font(self.font_name, 'B', 16)
            pdf.cell(0, 10, title, ln=True, align='C')
            pdf.ln(5)
            pdf.line(20, pdf.get_y(), pdf.w - 20, pdf.get_y())
            pdf.ln(5)

            # Date (centred); a missing or empty date shows the placeholder.
            date = self._format_date(post.get('date') or 'Unknown date')
            pdf.set_font(self.font_name, '', 12)
            pdf.cell(0, 10, date, ln=True, align='C')
            pdf.ln(5)
            pdf.line(20, pdf.get_y(), pdf.w - 20, pdf.get_y())
            pdf.ln(5)

            # Tags carry emoji prefixes, so they go through the emoji-aware
            # writer; the text font has no glyphs for them.
            tags = self._extract_tags(post)
            if tags:
                self._write_text_with_emojis(
                    pdf, f"Tags: {tags}", font_size=12, style='I', align='C'
                )
                pdf.ln(5)
                pdf.line(20, pdf.get_y(), pdf.w - 20, pdf.get_y())
                pdf.ln(10)
            else:
                pdf.ln(10)

            # Body paragraphs
            content = post.get('content', {}).get('rendered', '')
            paragraphs = self._process_content(content)
            pdf.set_font(self.font_name, '', 12)
            first_paragraph = True
            for paragraph in paragraphs:
                if not paragraph.strip():
                    continue
                if not first_paragraph:
                    pdf.ln(8)  # spacing between paragraphs
                # Do not start a paragraph in the last ~20 units of a page.
                if pdf.get_y() + 20 > pdf.h - pdf.b_margin:
                    pdf.add_page()
                self._write_text_with_emojis(pdf, paragraph, font_size=12)
                first_paragraph = False

            # Images: one per page, scaled to fit, centred horizontally.
            side_margin = 20  # user units kept free left and right
            top_offset = 10   # extra space below the top margin
            for image in images:
                if not image:
                    continue
                try:
                    available_width = pdf.w - 2 * side_margin
                    # The image starts top_offset below the top margin, so
                    # that offset is not available for its height.
                    available_height = pdf.h - pdf.t_margin - pdf.b_margin - top_offset

                    # Size the image before adding a page so an unusable
                    # image does not leave a blank page behind.
                    aspect = image.height / image.width
                    img_width = available_width
                    img_height = img_width * aspect
                    if img_height > available_height:
                        img_height = available_height
                        img_width = img_height / aspect

                    pdf.add_page()
                    x_pos = (pdf.w - img_width) / 2
                    y_pos = pdf.t_margin + top_offset
                    pdf.image(image, x=x_pos, y=y_pos, w=img_width)
                except Exception as e:
                    logging.warning(f"Failed to add image to PDF: {str(e)}")

            # Save
            filename = self._get_filename(post)
            pdf_path = self.output_dir / filename
            pdf.output(str(pdf_path))
            logging.info(f"Successfully created PDF: {filename}")
            return pdf_path
        except Exception as e:
            logging.error(f"Failed to create PDF: {str(e)}")
            raise

    def _clean_html_text(self, html_text: str) -> str:
        """Return ``html_text`` with entities decoded and tags removed."""
        if not html_text:
            return ""
        # Entities are decoded first, then whatever markup remains is dropped.
        text = html.unescape(html_text)
        soup = BeautifulSoup(text, 'html.parser')
        return soup.get_text()

    def _format_date(self, date_str: str) -> str:
        """Format an ISO-8601 date string as ``YYYYMMDD @ HH:MM``.

        A trailing ``Z`` is accepted as UTC. Input that cannot be parsed is
        logged and returned unchanged.
        """
        try:
            dt = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
        except (AttributeError, TypeError, ValueError) as e:
            logging.error(f"Error formatting date: {str(e)}")
            return date_str
        return dt.strftime('%Y%m%d @ %H:%M')

    def _get_filename(self, post: Dict) -> str:
        """Build the output filename in ``YYYYMMDD_title.pdf`` form.

        The title is reduced to word characters joined by underscores and
        capped at 50 characters; a title with nothing left becomes
        ``untitled``. On any failure (for example an unparsable date) the
        name falls back to ``unknown_date_<id>.pdf``.
        """
        try:
            date_str = post.get('date', '')
            dt = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
            date_part = dt.strftime('%Y%m%d')

            title = self._clean_html_text(post.get('title', {}).get('rendered', 'untitled'))
            title = re.sub(r'[^\w\s-]', '', title)
            title = re.sub(r'[-\s]+', '_', title)
            title = title[:50].strip('_') or 'untitled'
            return f"{date_part}_{title}.pdf"
        except Exception as e:
            logging.error(f"Error creating filename: {str(e)}")
            return f"unknown_date_{post.get('id', 'unknown')}.pdf"

    def _extract_tags(self, post: Dict) -> str:
        """Return the post's taxonomy terms as one comma-separated string.

        Terms are read from ``post['_embedded']['wp:term']`` (a list of term
        lists). Categories are prefixed with a folder emoji, ``post_tag``
        terms with a label emoji, other taxonomies are shown by name only.
        Duplicates are removed while keeping first-seen order. Returns an
        empty string when there are no terms.
        """
        tags: List[str] = []
        try:
            embedded = post.get('_embedded') or {}
            term_groups = embedded.get('wp:term') or []
            logging.debug("Embedded terms: %s", term_groups)

            for term_group in term_groups:
                if not isinstance(term_group, list):
                    continue
                for term in term_group:
                    if not isinstance(term, dict):
                        continue
                    name = term.get('name', '')
                    if not name:
                        continue
                    taxonomy = term.get('taxonomy', '')
                    if taxonomy == 'category':
                        tags.append(f"📁 {name}")
                    elif taxonomy == 'post_tag':
                        tags.append(f"🏷️ {name}")
                    else:
                        tags.append(name)
                    logging.debug("Added %s: %s", taxonomy, name)

            # dict.fromkeys keeps insertion order while dropping duplicates.
            tags = list(dict.fromkeys(tags))
        except Exception as e:
            logging.error(f"Error extracting tags: {str(e)}", exc_info=True)

        result = ', '.join(tags)
        logging.debug("Returning tags string: %s", result)
        return result

    def _collect_paragraphs(self, parent) -> List[str]:
        """Walk the children of a parsed HTML node and return its paragraphs.

        Text nodes and inline tags accumulate into the running paragraph;
        ``<br>`` and every block-level tag end it. Block-level tags are
        walked recursively so nested paragraphs and list items stay separate.
        HTML comments are ignored.
        """
        # Local import: the module only imports BeautifulSoup from bs4.
        from bs4 import Comment

        paragraphs: List[str] = []
        inline_parts: List[str] = []

        def flush() -> None:
            # Join the raw pieces first so spacing around inline tags is
            # kept, then collapse all whitespace runs to single spaces.
            text = ' '.join(html.unescape(''.join(inline_parts)).split())
            inline_parts.clear()
            if text:
                paragraphs.append(text)

        for node in parent.children:
            if isinstance(node, Comment):
                continue
            if isinstance(node, str):
                inline_parts.append(str(node))
            elif node.name == 'br':
                flush()
            elif node.name in self._INLINE_TAGS:
                # A line break inside inline markup becomes a space so the
                # words on either side are not glued together.
                for line_break in node.find_all('br'):
                    line_break.replace_with(' ')
                inline_parts.append(node.get_text())
            else:
                flush()
                paragraphs.extend(self._collect_paragraphs(node))

        flush()
        return paragraphs

    def _process_content(self, html_content: str) -> List[str]:
        """Convert post HTML into a list of whitespace-normalised paragraphs.

        ``<script>`` and ``<style>`` elements are discarded. Returns
        ``["No content available"]`` when nothing printable is found and
        ``["Error processing content"]`` when parsing fails.
        """
        if not html_content:
            return ["No content available"]
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            for element in soup(['script', 'style']):
                element.decompose()

            paragraphs = self._collect_paragraphs(soup)
            return paragraphs if paragraphs else ["No content available"]
        except Exception as e:
            logging.error(f"Error processing content: {str(e)}")
            return ["Error processing content"]

    def _create_pdf_instance(self) -> "FPDF":
        """Create a new document with the three NotoSans faces registered."""
        with warnings.catch_warnings():
            # Font registration can emit UserWarnings; they are not actionable here.
            warnings.filterwarnings('ignore', category=UserWarning)
            pdf = FPDF()
            for style, path in self.fonts.items():
                pdf.add_font(family=self.font_name, style=style, fname=path, uni=True)
        return pdf

    def _write_line(
        self,
        pdf: "FPDF",
        segments: List[Tuple[bool, str]],
        x: float,
        y: float,
        line_height: float,
        font_size: int,
        style: str,
    ) -> float:
        """Write one line of mixed text and emoji segments.

        Args:
            pdf: Target document.
            segments: ``(is_emoji, content)`` pairs to place left to right.
            x: Start position of the line.
            y: Top of the line.
            line_height: Height of the line in user units.
            font_size: Font size in points.
            style: FPDF font style code.

        Returns:
            The y coordinate of the last physical line written. It differs
            from ``y`` when a page break happened or when ``pdf.write``
            had to wrap an overlong piece of text.
        """
        # Images placed at explicit coordinates never trigger a page break,
        # so break here before anything on the line is drawn.
        if getattr(pdf, 'auto_page_break', False) and y + line_height > pdf.h - pdf.b_margin:
            pdf.add_page()
            y = pdf.get_y()

        emoji_size = self._emoji_size(pdf, font_size)
        pdf.set_font(self.font_name, style, font_size)
        current_x = x
        current_y = y

        for is_emoji, content in segments:
            if is_emoji:
                try:
                    emoji_image = self.cache_manager.get_emoji_image(content)
                    if emoji_image:
                        # Centre the emoji vertically within the line.
                        pdf.image(
                            emoji_image,
                            x=current_x,
                            y=current_y + (line_height - emoji_size) / 2,
                            h=emoji_size,
                        )
                        current_x += emoji_size
                except Exception as e:
                    logging.warning(f"Failed to add emoji image: {str(e)}")
            else:
                pdf.set_xy(current_x, current_y)
                pdf.write(line_height, content)
                # Continue from wherever the text actually ended.
                current_x = pdf.get_x()
                current_y = pdf.get_y()

        return current_y