Skip to content
This repository has been archived by the owner on Jan 6, 2025. It is now read-only.

Commit

Permalink
Fix #239
Browse files Browse the repository at this point in the history
  • Loading branch information
vinayak-mehta committed Jan 3, 2019
1 parent 7a0acd7 commit f605bd8
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 19 deletions.
8 changes: 4 additions & 4 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,10 @@ def _save_page(self, filepath, page, temp):
outfile.write(f)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
lttextlh = get_text_objects(layout, ltype="lh")
lttextlv = get_text_objects(layout, ltype="lv")
ltchar = get_text_objects(layout, ltype="char")
rotation = get_rotation(lttextlh, lttextlv, ltchar)
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != '':
fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
os.rename(fpath, fpath_new)
Expand Down
5 changes: 3 additions & 2 deletions camelot/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ def _generate_layout(self, filename, layout_kwargs):
self.layout_kwargs = layout_kwargs
self.layout, self.dimensions = get_page_layout(
filename, **layout_kwargs)
self.horizontal_text = get_text_objects(self.layout, ltype="lh")
self.vertical_text = get_text_objects(self.layout, ltype="lv")
self.images = get_text_objects(self.layout, ltype='image')
self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename)
8 changes: 6 additions & 2 deletions camelot/parsers/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,8 +356,12 @@ def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
logger.info('Processing {}'.format(os.path.basename(self.rootname)))

if not self.horizontal_text:
warnings.warn("No tables found on {}".format(
os.path.basename(self.rootname)))
if self.images:
warnings.warn('The page is image-based, Camelot only works with'
' text-based PDF pages.'.format(os.path.basename(self.rootname)))
else:
warnings.warn('No tables found on {}'.format(
os.path.basename(self.rootname)))
return []

self._generate_image()
Expand Down
8 changes: 6 additions & 2 deletions camelot/parsers/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,8 +395,12 @@ def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
logger.info('Processing {}'.format(os.path.basename(self.rootname)))

if not self.horizontal_text:
warnings.warn("No tables found on {}".format(
os.path.basename(self.rootname)))
if self.images:
warnings.warn('The page is image-based, Camelot only works with'
' text-based PDF pages.'.format(os.path.basename(self.rootname)))
else:
warnings.warn('No tables found on {}'.format(
os.path.basename(self.rootname)))
return []

self._generate_table_bbox()
Expand Down
20 changes: 11 additions & 9 deletions camelot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
LTTextLineVertical)
LTTextLineVertical, LTImage)


PY3 = sys.version_info[0] >= 3
Expand Down Expand Up @@ -270,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors):
return tables_new, v_segments_new, h_segments_new


def get_rotation(lttextlh, lttextlv, ltchar):
def get_rotation(chars, horizontal_text, vertical_text):
"""Detects if text in table is rotated or not using the current
transformation matrix (CTM) and returns its orientation.
Parameters
----------
lttextlh : list
horizontal_text : list
List of PDFMiner LTTextLineHorizontal objects.
lttextlv : list
vertical_text : list
List of PDFMiner LTTextLineVertical objects.
chars : list
List of PDFMiner LTChar objects.
Expand All @@ -292,8 +292,8 @@ def get_rotation(lttextlh, lttextlv, ltchar):
"""
rotation = ''
hlen = len([t for t in lttextlh if t.get_text().strip()])
vlen = len([t for t in lttextlv if t.get_text().strip()])
hlen = len([t for t in horizontal_text if t.get_text().strip()])
vlen = len([t for t in vertical_text if t.get_text().strip()])
if hlen < vlen:
clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
Expand Down Expand Up @@ -713,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None):
List of PDFMiner text objects.
"""
if ltype == "char":
if ltype == 'char':
LTObject = LTChar
elif ltype == "lh":
elif ltype == 'image':
LTObject = LTImage
elif ltype == 'horizontal_text':
LTObject = LTTextLineHorizontal
elif ltype == "lv":
elif ltype == 'vertical_text':
LTObject = LTTextLineVertical
if t is None:
t = []
Expand Down

0 comments on commit f605bd8

Please sign in to comment.