From 0fc899f760691fb6ec9bba914edddd9d0c19ca96 Mon Sep 17 00:00:00 2001 From: "Jeremy B. Merrill" Date: Fri, 14 Nov 2014 17:29:22 -0700 Subject: [PATCH] add option to note first page that has text elements in pages JSON, remove texts key --- lib/tabula/entities/page.rb | 6 +++++- lib/tabula/extraction.rb | 19 +++++++++++++++---- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/lib/tabula/entities/page.rb b/lib/tabula/entities/page.rb index bd2a383..57299ec 100644 --- a/lib/tabula/entities/page.rb +++ b/lib/tabula/entities/page.rb @@ -191,6 +191,10 @@ def number(indexing_base=:one_indexed) end end + def has_text? + !self.texts.empty? + end + # TODO no need for this, let's choose one name def ruling_lines get_ruling_lines! @@ -258,7 +262,7 @@ def to_json(options={}) :height => self.height, :number => self.number, :rotation => self.rotation, - :texts => self.texts + :hasText => self.has_text? }.to_json(options) end diff --git a/lib/tabula/extraction.rb b/lib/tabula/extraction.rb index d739dfe..e8434e6 100644 --- a/lib/tabula/extraction.rb +++ b/lib/tabula/extraction.rb @@ -371,26 +371,37 @@ def debugPath(path) class PagesInfoExtractor - def initialize(pdf_filename, password='') - @pdf_filename = pdf_filename - @pdf_file = Extraction.openPDF(pdf_filename, password) + def initialize(pdf_file_path, password='') + @pdf_filename = pdf_file_path + @pdf_file = Extraction.openPDF(pdf_file_path, password) @all_pages = @pdf_file.getDocumentCatalog.getAllPages + + @extractor = Tabula::Extraction::ObjectExtractor.new(pdf_file_path, :all ) end def pages + found_page_with_texts = false Enumerator.new do |y| begin @all_pages.each_with_index do |page, i| contents = page.getContents - y.yield Tabula::Page.new(@pdf_filename, + if found_page_with_texts + page = Tabula::Page.new(@pdf_filename, page.findCropBox.width, page.findCropBox.height, page.getRotation.to_i, i+1) #remember, these are one-indexed + else + page = @extractor.extract_page(i+1) + found_page_with_texts = page.has_text? + end + + y.yield page end ensure @pdf_file.close + @extractor.close! end end end