Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEATURE: Extract Paragraphs #524

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/pdf/reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ def root
require 'pdf/reader/bounding_rectangle_runs_filter'
require 'pdf/reader/cid_widths'
require 'pdf/reader/cmap'
require 'pdf/reader/disjoint_set'
require 'pdf/reader/encoding'
require 'pdf/reader/error'
require 'pdf/reader/filter'
Expand All @@ -303,6 +304,7 @@ def root
require 'pdf/reader/object_hash'
require 'pdf/reader/object_stream'
require 'pdf/reader/pages_strategy'
require 'pdf/reader/paragraph'
require 'pdf/reader/parser'
require 'pdf/reader/point'
require 'pdf/reader/print_receiver'
Expand Down
73 changes: 73 additions & 0 deletions lib/pdf/reader/disjoint_set.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# coding: utf-8
# typed: strict
# frozen_string_literal: true

module PDF
  class Reader

    # A disjoint-set data structure (also called union-find or merge-find set).
    # It stores a collection of items partitioned into disjoint
    # (non-overlapping) sets.
    #
    # Items are kept as keys of a Hash, so item identity follows Hash key
    # semantics (#hash / #eql?). Sets are merged with union by rank and #find
    # compresses the paths it walks.
    class DisjointSet
      include Enumerable

      def initialize
        # item => parent item; a root is its own parent
        @parents = {}
        # root item => rank used to decide which root survives a union
        @ranks = {}
      end

      # Returns true if item has been added to the collection.
      def contains(item)
        @parents.key?(item)
      end

      # Yields every item that has been added, regardless of which set it
      # belongs to. Returns an Enumerator when no block is given.
      def each(&block)
        return enum_for(:each) unless block_given?

        @parents.each_key(&block)
      end

      # Number of items added (not the number of sets).
      def length
        @parents.length
      end

      # Adds x as a new single-item set. Returns self so calls can be chained.
      #
      # Adding an item that is already present is a no-op: resetting its parent
      # and rank would silently detach it from the set it has been merged into.
      def add(x)
        return self if @parents.key?(x)

        @parents[x] = x
        @ranks[x] = 0
        self
      end

      # Returns the representative (root) item of the set containing x, or nil
      # if x has not been added.
      def find(x)
        return nil unless @parents.key?(x)

        # Walk up to the root. eql? is used (not ==) so that the comparison
        # agrees with how the Hash distinguishes keys.
        root = x
        root = @parents[root] until @parents[root].eql?(root)

        # Path compression: point every item on the walked path at the root.
        # Only values of existing keys are updated, so this is safe to do
        # while the hash is being iterated.
        node = x
        until node.eql?(root)
          parent = @parents[node]
          @parents[node] = root
          node = parent
        end

        root
      end

      # Returns an Array of Arrays, one inner Array per disjoint set, each
      # holding that set's items in insertion order.
      def sets
        group_by { |item| find(item) }.values
      end

      # Merges the sets containing x and y. Both items are expected to have
      # been added already. Returns self so calls can be chained.
      def union(x, y)
        x_root = find(x)
        y_root = find(y)

        return self if x_root.eql?(y_root)

        # Union by rank: attach the shallower tree under the deeper one. On a
        # tie x's root wins and its rank grows by one.
        if @ranks[x_root] > @ranks[y_root]
          @parents[y_root] = x_root
        elsif @ranks[y_root] > @ranks[x_root]
          @parents[x_root] = y_root
        else
          @parents[y_root] = x_root
          @ranks[x_root] += 1
        end

        self
      end
    end
  end
end
39 changes: 39 additions & 0 deletions lib/pdf/reader/page.rb
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,45 @@ def rectangles
}
end

# Returns the text of each paragraph found on the page as an Array of Strings,
# one String per paragraph.
#
# Two text runs are placed in the same paragraph when all of these hold:
#
# * their font sizes differ by no more than :maximum_allowed_font_difference
# * their vertical distance, divided by the smaller of the two font sizes, is
#   no more than :maximum_multiplied_leading
# * TextRun#horizontal_overlap reports at least
#   :minimum_horizontal_overlap_percentage for the pair
#
# Grouping is transitive: runs are merged through a DisjointSet.
#
# Options (the same opts hash is also passed to #runs):
#
#   :minimum_horizontal_overlap_percentage - default 0.80
#   :maximum_multiplied_leading            - default 1.40
#   :maximum_allowed_font_difference       - default 1.00
def paragraphs(opts = {})
  minimum_horizontal_overlap_percentage = opts.fetch(:minimum_horizontal_overlap_percentage, 0.80)
  maximum_multiplied_leading = opts.fetch(:maximum_multiplied_leading, 1.40)
  maximum_allowed_font_difference = opts.fetch(:maximum_allowed_font_difference, 1.00)

  disjoint_set = PDF::Reader::DisjointSet.new
  runs(opts).each { |run| disjoint_set.add(run) }

  # Build the disjoint set by comparing every ordered pair of runs and
  # merging the pairs that belong to the same paragraph.
  disjoint_set.each do |l0|
    disjoint_set.each do |l1|
      next if l0 == l1
      next if disjoint_set.find(l0) == disjoint_set.find(l1)

      next if (l0.font_size - l1.font_size).abs > maximum_allowed_font_difference

      # fdiv always performs floating point division. Plain `/` would
      # truncate when both operands are Integers and raise
      # ZeroDivisionError for an Integer font size of 0; with fdiv a zero
      # font size yields Infinity or NaN, which fails the check below.
      leading = (l0.y - l1.y).abs.fdiv([l0.font_size, l1.font_size].min)
      next unless leading <= maximum_multiplied_leading

      overlap_percentage = l0.horizontal_overlap(l1)
      next unless overlap_percentage >= minimum_horizontal_overlap_percentage

      disjoint_set.union(l0, l1)
    end
  end

  paragraphs = disjoint_set.sets.map do |set|
    # remember, pdf page origin is bottom left corner
    leftmost_x = set.map(&:x).min
    topmost_y = set.map(&:y).max
    text = set.map { |run| run.text.strip }.join(' ')

    PDF::Reader::Paragraph.new(text, PDF::Reader::Point.new(leftmost_x, topmost_y))
  end

  # Only the text is exposed to callers; the origin of each Paragraph is
  # computed above but not returned.
  paragraphs.map(&:text)
end

private

def root
Expand Down
18 changes: 18 additions & 0 deletions lib/pdf/reader/paragraph.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# coding: utf-8
# typed: true
# frozen_string_literal: true

module PDF
  class Reader

    # Read-only pairing of a paragraph's text with its origin, as built by
    # PDF::Reader::Page#paragraphs.
    class Paragraph
      # The text passed to the constructor.
      attr_reader :text

      # The origin passed to the constructor.
      attr_reader :origin

      # text   - the paragraph's text
      # origin - the paragraph's origin
      def initialize(text, origin)
        @text, @origin = text, origin
      end
    end
  end
end
11 changes: 11 additions & 0 deletions lib/pdf/reader/text_run.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,17 @@ def intersection_area_percent(other_run)
intersection_area.to_f / area
end

# Returns the fraction (between 0 and 1) of this run's horizontal extent that
# is also covered by other_run.
#
# The extent of a run is the span between its x and endx, in whichever order
# they appear. Returns 0 when the runs do not intersect horizontally, or when
# this run has zero width (no fraction can be computed).
def horizontal_overlap(other_run)
  left, right = [x, endx].minmax
  other_left, other_right = [other_run.x, other_run.endx].minmax

  # one run lies entirely to the side of the other
  return 0 if right < other_left || other_right < left

  span = right - left
  return 0 if span.zero?

  shared = [right, other_right].min - [left, other_left].max
  # fdiv keeps the result fractional even when the coordinates are Integers
  shared.fdiv(span)
end

private

def area
Expand Down
49 changes: 49 additions & 0 deletions rbi/pdf-reader.rbi
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,38 @@ module PDF
def bfrange_type_two(start_code, end_code, dst); end
end

# Type signatures for PDF::Reader::DisjointSet.
class DisjointSet
  include Enumerable
  Elem = type_member { {fixed: T.untyped} }

  sig { void }
  def initialize
    @parents = T.let({}, T::Hash[T.anything, T.untyped])
    @ranks = T.let({}, T::Hash[T.anything, T.untyped])
  end

  sig { params(item: T.anything).returns(T::Boolean) }
  def contains(item); end

  # The block is optional. T.nilable needs the type it wraps, so the block
  # type is spelled out here.
  sig { override.params(block: T.nilable(T.proc.params(arg0: Elem).returns(BasicObject))).returns(T.any(T::Hash[T.untyped, T.untyped], T::Enumerator[T.untyped])) }
  def each(&block); end

  sig { returns(Integer) }
  def length; end

  sig { params(x: T.untyped).returns(PDF::Reader::DisjointSet) }
  def add(x); end

  sig { type_parameters(:U).params(x: T.type_parameter(:U)).returns(T.type_parameter(:U)) }
  def find(x); end

  sig { returns(T::Array[T.untyped]) }
  def sets; end

  sig { params(x: T.untyped, y: T.untyped).returns(PDF::Reader::DisjointSet) }
  def union(x, y); end
end

class Encoding
CONTROL_CHARS = T.let(T.unsafe(nil), T::Array[Integer])
UNKNOWN_CHAR = T.let(T.unsafe(nil), Integer)
Expand Down Expand Up @@ -931,6 +963,9 @@ module PDF
sig { returns(T::Hash[Symbol, PDF::Reader::Rectangle]) }
def rectangles; end

# Takes an options hash keyed by Symbol and returns an Array of Strings.
sig { params(opts: T::Hash[Symbol, T.untyped]).returns(T::Array[String]) }
def paragraphs(opts = {}); end

sig { returns(T::Hash[Symbol, T.untyped]) }
def root; end

Expand Down Expand Up @@ -1198,6 +1233,17 @@ module PDF
OPERATORS = T.let(T.unsafe(nil), T::Hash[String, Symbol])
end

# Type signatures for PDF::Reader::Paragraph.
class Paragraph
  # The paragraph's text.
  sig { returns(String) }
  attr_reader :text

  # The paragraph's origin, as a Point.
  sig { returns(PDF::Reader::Point) }
  attr_reader :origin

  # Takes the text and the origin Point that the two readers expose.
  sig { params(text: String, origin: PDF::Reader::Point).void }
  def initialize(text, origin); end
end

class Parser
sig { params(buffer: PDF::Reader::Buffer, objects: T.nilable(PDF::Reader::ObjectHash)).void }
def initialize(buffer, objects=nil); end
Expand Down Expand Up @@ -1577,6 +1623,9 @@ module PDF
sig { params(other_run: T.untyped).returns(Numeric) }
def intersection_area_percent(other_run); end

# Takes another run (untyped) and returns a Numeric describing the
# horizontal overlap between the two runs.
sig { params(other_run: T.untyped).returns(Numeric) }
def horizontal_overlap(other_run); end

sig { returns(Numeric) }
def area; end

Expand Down
97 changes: 97 additions & 0 deletions spec/disjoint_set_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# typed: false
# coding: utf-8

describe PDF::Reader::DisjointSet do
  # A fresh, empty set for every example unless a nested group overrides it.
  let(:set) { described_class.new }

  describe "#add" do
    it "adds a new item to the set" do
      set.add(5)

      expect(set.length).to eq(1)
      expect(set.contains(5)).to be_truthy
    end
  end

  describe "#each" do
    # #add and #union return the set itself, so the fixture can be chained.
    let(:set) { described_class.new.add(1).add(2).add(3).union(1, 2) }

    it "iterates over each item in the set (even if unions are created)" do
      expect(set.each.to_a).to eq([1, 2, 3])
    end

    it "is used by Enumerable to provide iterative functionality like #map" do
      expect(set.map(&:to_s)).to eq(['1', '2', '3'])
    end
  end

  describe "#find" do
    it "finds the parent of the item" do
      set.add("parent").add("child")
      set.union("parent", "child")

      expect(set.find("parent")).to eq("parent")
      expect(set.find("child")).to eq("parent")
    end

    it "returns the item if it is a parent" do
      set.add("item")

      expect(set.find("item")).to eq("item")
    end
  end

  describe "#sets" do
    it "returns an array of arrays containing the sets" do
      ["parent", "child", "unrelated"].each { |item| set.add(item) }
      set.union("parent", "child")

      expect(set.sets).to eq([["parent", "child"], ["unrelated"]])
    end
  end

  describe "#union" do
    let(:set) do
      described_class.new.add("parent").add("child").add("grandchild").add("unrelated")
    end

    it "handles multiple unions" do
      set.union("parent", "child").union("child", "grandchild")

      expect(set.sets).to eq([["parent", "child", "grandchild"], ["unrelated"]])
    end

    it "handles union params regardless of order" do
      set.union("child", "parent").union("grandchild", "child")

      expect(set.sets).to eq([["parent", "child", "grandchild"], ["unrelated"]])
    end

    it "gracefully handles union of identical elements" do
      set.union("child", "child")

      expect(set.sets).to eq([["parent"], ["child"], ["grandchild"], ["unrelated"]])
    end

    it "handles joining multiple previous unions" do
      # uses its own set rather than the four-item fixture above
      forest = described_class.new
      ["parent1", "child1", "parent2", "child2"].each { |item| forest.add(item) }

      forest.union("parent1", "child1")
      forest.union("parent2", "child2")
      forest.union("parent1", "parent2")

      expect(forest.sets).to eq([["parent1", "child1", "parent2", "child2"]])
    end
  end
end
36 changes: 36 additions & 0 deletions spec/page_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,42 @@
end
end

describe "#paragraphs page 1" do
  let!(:page) { browser.page(1) }

  # Collapses a multi-line heredoc into the single-line form that
  # Page#paragraphs is expected to produce.
  def single_line(text)
    text.strip.gsub(/\n/, " ")
  end

  context "of cairo-basic.pdf" do
    let!(:browser) { PDF::Reader.new(pdf_spec_file("cairo-basic")) }

    it "returns the text content" do
      expect(page.paragraphs).to eql(["Hello James"])
    end
  end

  context "of all_page_boxes_exist.pdf" do
    let!(:browser) { PDF::Reader.new(pdf_spec_file("all_page_boxes_exist")) }

    it "returns headlines as their own paragraph" do
      expect(page.paragraphs).to include("PDF Automation")
    end

    it "returns actual paragraphs" do
      expected = single_line(<<~TEXT)
        PDF page boxes include Media Box, Trim Box and Bleed Box. Imposition
        in the Sheridan work flow requires a Trim Box and a Bleed Box where
        bleeds are present with a consistent Media Box.
      TEXT

      expect(page.paragraphs).to include(expected)
    end

    it "returns paragraphs from multi-column layouts" do
      expected = single_line(<<~TEXT)
        QuarkXPress Enter your trim size of the Width and the Height. Elements that bleed
        must extend .125" (1/8") beyond the project’s trim edge in your project
        layout.
      TEXT

      expect(page.paragraphs).to include(expected)
    end
  end
end

describe "#walk" do

context "with page 1 of cairo-basic.pdf" do
Expand Down