From 0fc899f760691fb6ec9bba914edddd9d0c19ca96 Mon Sep 17 00:00:00 2001
From: "Jeremy B. Merrill" <jeremybmerrill@gmail.com>
Date: Fri, 14 Nov 2014 17:29:22 -0700
Subject: [PATCH] add option to note first page that has text elements in pages
 JSON, remove texts key

---
 lib/tabula/entities/page.rb |  6 +++++-
 lib/tabula/extraction.rb    | 19 +++++++++++++++----
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/lib/tabula/entities/page.rb b/lib/tabula/entities/page.rb
index bd2a383..57299ec 100644
--- a/lib/tabula/entities/page.rb
+++ b/lib/tabula/entities/page.rb
@@ -191,6 +191,10 @@ def number(indexing_base=:one_indexed)
       end
     end
 
+    def has_text? # true when this page's texts collection is non-empty
+      !texts.empty?
+    end
+
     # TODO no need for this, let's choose one name
     def ruling_lines
       get_ruling_lines!
@@ -258,7 +262,7 @@ def to_json(options={})
         :height => self.height,
         :number => self.number,
         :rotation => self.rotation,
-        :texts => self.texts
+        :hasText => self.has_text?
       }.to_json(options)
     end
 
diff --git a/lib/tabula/extraction.rb b/lib/tabula/extraction.rb
index d739dfe..e8434e6 100644
--- a/lib/tabula/extraction.rb
+++ b/lib/tabula/extraction.rb
@@ -371,26 +371,37 @@ def debugPath(path)
 
 
     class PagesInfoExtractor
-      def initialize(pdf_filename, password='')
-        @pdf_filename = pdf_filename
-        @pdf_file = Extraction.openPDF(pdf_filename, password)
+      def initialize(pdf_file_path, password='')
+        @pdf_filename = pdf_file_path
+        @pdf_file = Extraction.openPDF(pdf_file_path, password)
         @all_pages = @pdf_file.getDocumentCatalog.getAllPages
+        # Extractor used by #pages; password forwarded (assumes new(path, pages, password) -- confirm).
+        @extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all, password)
       end
 
       def pages
+        # Pages come from @extractor until one has_text?; later ones carry only size, rotation and number.
+        text_seen = false
         Enumerator.new do |y|
           begin
             @all_pages.each_with_index do |page, i|
               contents = page.getContents
 
-              y.yield Tabula::Page.new(@pdf_filename,
+              tabula_page = if text_seen
+                Tabula::Page.new(@pdf_filename,
                                        page.findCropBox.width,
                                        page.findCropBox.height,
                                        page.getRotation.to_i,
                                        i+1) #remember, these are one-indexed
+              else
+                @extractor.extract_page(i+1)
+              end
+              text_seen ||= tabula_page.has_text?
+              y.yield tabula_page
             end
           ensure
             @pdf_file.close
+            @extractor.close!
           end
         end
       end