Merge pull request #89 from tabulapdf/feature/warnAboutScans

Notes the first page that has text elements in the pages JSON. @jazzido: Let me know if you think this introduces a regression.
tabulapdf · Jan 25, 2015 · 00a15c3 · 00a15c3 · jazzido · Jan 25, 2015
2 parents 95df965 + 0fc899f
commit 00a15c3
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 5 deletions.
diff --git a/lib/tabula/entities/page.rb b/lib/tabula/entities/page.rb
@@ -191,6 +191,10 @@ def number(indexing_base=:one_indexed)
       end
     end
 
+    def has_text?
+      !self.texts.empty?
+    end
+
     # TODO no need for this, let's choose one name
     def ruling_lines
       get_ruling_lines!
@@ -258,7 +262,7 @@ def to_json(options={})
         :height => self.height,
         :number => self.number,
         :rotation => self.rotation,
-        :texts => self.texts
+        :hasText => self.has_text?
       }.to_json(options)
     end
 

diff --git a/lib/tabula/extraction.rb b/lib/tabula/extraction.rb
@@ -371,26 +371,37 @@ def debugPath(path)
 
 
     class PagesInfoExtractor
-      def initialize(pdf_filename, password='')
-        @pdf_filename = pdf_filename
-        @pdf_file = Extraction.openPDF(pdf_filename, password)
+      def initialize(pdf_file_path, password='')
+        @pdf_filename = pdf_file_path
+        @pdf_file = Extraction.openPDF(pdf_file_path, password)
         @all_pages = @pdf_file.getDocumentCatalog.getAllPages
+
+        @extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all )
       end
 
       def pages
+        found_page_with_texts = false
         Enumerator.new do |y|
           begin
             @all_pages.each_with_index do |page, i|
               contents = page.getContents
 
-              y.yield Tabula::Page.new(@pdf_filename,
+              if found_page_with_texts
+                page = Tabula::Page.new(@pdf_filename,
                                        page.findCropBox.width,
                                        page.findCropBox.height,
                                        page.getRotation.to_i,
                                        i+1) #remember, these are one-indexed
+              else 
+                page = @extractor.extract_page(i+1)
+                found_page_with_texts = page.has_text?
+              end
+
+              y.yield page
             end
           ensure
             @pdf_file.close
+            @extractor.close!
           end
         end
       end