Fix #239

atlanhq · Jan 3, 2019 · f605bd8
1 parent 7a0acd7
commit f605bd8

Showing 5 changed files with 30 additions and 19 deletions.
diff --git a/camelot/handlers.py b/camelot/handlers.py
@@ -107,10 +107,10 @@ def _save_page(self, filepath, page, temp):
                 outfile.write(f)
             layout, dim = get_page_layout(fpath)
             # fix rotated PDF
-            lttextlh = get_text_objects(layout, ltype="lh")
-            lttextlv = get_text_objects(layout, ltype="lv")
-            ltchar = get_text_objects(layout, ltype="char")
-            rotation = get_rotation(lttextlh, lttextlv, ltchar)
+            chars = get_text_objects(layout, ltype="char")
+            horizontal_text = get_text_objects(layout, ltype="horizontal_text")
+            vertical_text = get_text_objects(layout, ltype="vertical_text")
+            rotation = get_rotation(chars, horizontal_text, vertical_text)
             if rotation != '':
                 fpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
                 os.rename(fpath, fpath_new)

diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py
@@ -13,7 +13,8 @@ def _generate_layout(self, filename, layout_kwargs):
         self.layout_kwargs = layout_kwargs
         self.layout, self.dimensions = get_page_layout(
             filename, **layout_kwargs)
-        self.horizontal_text = get_text_objects(self.layout, ltype="lh")
-        self.vertical_text = get_text_objects(self.layout, ltype="lv")
+        self.images = get_text_objects(self.layout, ltype='image')
+        self.horizontal_text = get_text_objects(self.layout, ltype='horizontal_text')
+        self.vertical_text = get_text_objects(self.layout, ltype='vertical_text')
         self.pdf_width, self.pdf_height = self.dimensions
         self.rootname, __ = os.path.splitext(self.filename)
diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
@@ -356,8 +356,12 @@ def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
             logger.info('Processing {}'.format(os.path.basename(self.rootname)))
 
         if not self.horizontal_text:
-            warnings.warn("No tables found on {}".format(
-                os.path.basename(self.rootname)))
+            if self.images:
+                warnings.warn('The page is image-based, Camelot only works with'
+                              ' text-based PDF pages.'.format(os.path.basename(self.rootname)))
+            else:
+                warnings.warn('No tables found on {}'.format(
+                    os.path.basename(self.rootname)))
             return []
 
         self._generate_image()

diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
@@ -395,8 +395,12 @@ def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}):
             logger.info('Processing {}'.format(os.path.basename(self.rootname)))
 
         if not self.horizontal_text:
-            warnings.warn("No tables found on {}".format(
-                os.path.basename(self.rootname)))
+            if self.images:
+                warnings.warn('The page is image-based, Camelot only works with'
+                              ' text-based PDF pages.'.format(os.path.basename(self.rootname)))
+            else:
+                warnings.warn('No tables found on {}'.format(
+                    os.path.basename(self.rootname)))
             return []
 
         self._generate_table_bbox()

diff --git a/camelot/utils.py b/camelot/utils.py
@@ -20,7 +20,7 @@
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import (LAParams, LTAnno, LTChar, LTTextLineHorizontal,
-                             LTTextLineVertical)
+                             LTTextLineVertical, LTImage)
 
 
 PY3 = sys.version_info[0] >= 3
@@ -270,15 +270,15 @@ def scale_image(tables, v_segments, h_segments, factors):
     return tables_new, v_segments_new, h_segments_new
 
 
-def get_rotation(lttextlh, lttextlv, ltchar):
+def get_rotation(chars, horizontal_text, vertical_text):
     """Detects if text in table is rotated or not using the current
     transformation matrix (CTM) and returns its orientation.
 
     Parameters
     ----------
-    lttextlh : list
+    horizontal_text : list
         List of PDFMiner LTTextLineHorizontal objects.
-    lttextlv : list
+    vertical_text : list
         List of PDFMiner LTTextLineVertical objects.
     ltchar : list
         List of PDFMiner LTChar objects.
@@ -292,8 +292,8 @@ def get_rotation(lttextlh, lttextlv, ltchar):
 
     """
     rotation = ''
-    hlen = len([t for t in lttextlh if t.get_text().strip()])
-    vlen = len([t for t in lttextlv if t.get_text().strip()])
+    hlen = len([t for t in horizontal_text if t.get_text().strip()])
+    vlen = len([t for t in vertical_text if t.get_text().strip()])
     if hlen < vlen:
         clockwise = sum(t.matrix[1] < 0 and t.matrix[2] > 0 for t in ltchar)
         anticlockwise = sum(t.matrix[1] > 0 and t.matrix[2] < 0 for t in ltchar)
@@ -713,11 +713,13 @@ def get_text_objects(layout, ltype="char", t=None):
         List of PDFMiner text objects.
 
     """
-    if ltype == "char":
+    if ltype == 'char':
         LTObject = LTChar
-    elif ltype == "lh":
+    elif ltype == 'image':
+        LTObject = LTImage
+    elif ltype == 'horizontal_text':
         LTObject = LTTextLineHorizontal
-    elif ltype == "lv":
+    elif ltype == 'vertical_text':
         LTObject = LTTextLineVertical
     if t is None:
         t = []