Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG + 1] Rename the kwarg table_area to table_areas #171

Merged
merged 1 commit into from
Oct 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions camelot/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def cli(ctx, *args, **kwargs):


@cli.command('lattice')
@click.option('-T', '--table_area', default=[], multiple=True,
@click.option('-T', '--table_areas', default=[], multiple=True,
help='Table areas to process. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-back', '--process_background', is_flag=True,
Expand Down Expand Up @@ -95,8 +95,8 @@ def lattice(c, *args, **kwargs):
filepath = kwargs.pop('filepath')
kwargs.update(conf)

table_area = list(kwargs['table_area'])
kwargs['table_area'] = None if not table_area else table_area
table_areas = list(kwargs['table_areas'])
kwargs['table_areas'] = None if not table_areas else table_areas
copy_text = list(kwargs['copy_text'])
kwargs['copy_text'] = None if not copy_text else copy_text
kwargs['shift_text'] = list(kwargs['shift_text'])
Expand All @@ -116,7 +116,7 @@ def lattice(c, *args, **kwargs):


@cli.command('stream')
@click.option('-T', '--table_area', default=[], multiple=True,
@click.option('-T', '--table_areas', default=[], multiple=True,
help='Table areas to process. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-C', '--columns', default=[], multiple=True,
Expand All @@ -142,8 +142,8 @@ def stream(c, *args, **kwargs):
filepath = kwargs.pop('filepath')
kwargs.update(conf)

table_area = list(kwargs['table_area'])
kwargs['table_area'] = None if not table_area else table_area
table_areas = list(kwargs['table_areas'])
kwargs['table_areas'] = None if not table_areas else table_areas
columns = list(kwargs['columns'])
kwargs['columns'] = None if not columns else columns

Expand Down
2 changes: 1 addition & 1 deletion camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def read_pdf(filepath, pages='1', flavor='lattice', suppress_warnings=False,
Lattice is used by default.
suppress_warnings : bool, optional (default: False)
Prevent warnings from being emitted by Camelot.
table_area : list, optional (default: None)
table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
Expand Down
10 changes: 5 additions & 5 deletions camelot/parsers/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class Lattice(BaseParser):

Parameters
----------
table_area : list, optional (default: None)
table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
Expand Down Expand Up @@ -76,12 +76,12 @@ class Lattice(BaseParser):
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.

"""
def __init__(self, table_area=None, process_background=False,
def __init__(self, table_areas=None, process_background=False,
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
split_text=False, flag_size=False, line_close_tol=2,
joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2,
iterations=0, margins=(1.0, 0.5, 0.1), **kwargs):
self.table_area = table_area
self.table_areas = table_areas
self.process_background = process_background
self.line_size_scaling = line_size_scaling
self.copy_text = copy_text
Expand Down Expand Up @@ -244,9 +244,9 @@ def _generate_table_bbox(self):
self.threshold, direction='horizontal',
line_size_scaling=self.line_size_scaling, iterations=self.iterations)

if self.table_area is not None:
if self.table_areas is not None:
areas = []
for area in self.table_area:
for area in self.table_areas:
x1, y1, x2, y2 = area.split(",")
x1 = float(x1)
y1 = float(y1)
Expand Down
16 changes: 8 additions & 8 deletions camelot/parsers/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class Stream(BaseParser):

Parameters
----------
table_area : list, optional (default: None)
table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
Expand All @@ -50,10 +50,10 @@ class Stream(BaseParser):
For more information, refer `PDFMiner docs <https://euske.github.io/pdfminer/>`_.

"""
def __init__(self, table_area=None, columns=None, split_text=False,
def __init__(self, table_areas=None, columns=None, split_text=False,
flag_size=False, row_close_tol=2, col_close_tol=0,
margins=(1.0, 0.5, 0.1), **kwargs):
self.table_area = table_area
self.table_areas = table_areas
self.columns = columns
self._validate_columns()
self.split_text = split_text
Expand Down Expand Up @@ -241,15 +241,15 @@ def _join_columns(cols, text_x_min, text_x_max):
return cols

def _validate_columns(self):
if self.table_area is not None and self.columns is not None:
if len(self.table_area) != len(self.columns):
raise ValueError("Length of table_area and columns"
if self.table_areas is not None and self.columns is not None:
if len(self.table_areas) != len(self.columns):
raise ValueError("Length of table_areas and columns"
" should be equal")

def _generate_table_bbox(self):
if self.table_area is not None:
if self.table_areas is not None:
table_bbox = {}
for area in self.table_area:
for area in self.table_areas:
x1, y1, x2, y2 = area.split(",")
x1 = float(x1)
y1 = float(y1)
Expand Down
6 changes: 3 additions & 3 deletions docs/user/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,13 @@ Specify table areas

Since :ref:`Stream <stream>` treats the whole page as a table, `for now`_, it's useful to specify table boundaries in cases such as `these <../_static/pdf/table_areas.pdf>`__. You can :ref:`plot the text <geometry_text>` on this page and note the top left and bottom right coordinates of the table.

Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_area`` keyword argument.
Table areas that you want Camelot to analyze can be passed as a list of comma-separated strings to :meth:`read_pdf() <camelot.read_pdf>`, using the ``table_areas`` keyword argument.

.. _for now: https://github.com/socialcopsdev/camelot/issues/102

::

>>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_area=['316,499,566,337'])
>>> tables = camelot.read_pdf('table_areas.pdf', flavor='stream', table_areas=['316,499,566,337'])
>>> tables[0].df

.. csv-table::
Expand All @@ -172,7 +172,7 @@ You can pass the column separators as a list of comma-separated strings to :meth

If you pass a list containing a single column separators string and no table area is specified, the separators will be applied to the whole page. When a list of table areas is specified and you need to specify column separators as well, **the length of both lists should be equal**. Each table area is mapped to the column separators string at the same index in the ``columns`` list.

For example, if you have specified two table areas, ``table_area=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table, you can pass an empty string for the second table in the column separators' list like this, ``columns=['10,120,200,400', '']``.
For example, if you have specified two table areas, ``table_areas=['12,23,43,54', '20,33,55,67']``, and only want to specify column separators for the first table, you can pass an empty string for the second table in the column separators' list like this, ``columns=['10,120,200,400', '']``.

Let's get back to the *x* coordinates we got from :ref:`plotting text <geometry_text>` that exists on this `PDF <../_static/pdf/column_separators.pdf>`__, and get the table out!

Expand Down
4 changes: 2 additions & 2 deletions tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
["", "", "", "", "", "", "", "", "54", "", "", "", "", "", "", "", "", ""]
]

data_stream_table_area = [
data_stream_table_areas = [
["", "One Withholding"],
["Payroll Period", "Allowance"],
["Weekly", "$71.15"],
Expand Down Expand Up @@ -261,7 +261,7 @@
["Pooled", "38742", "53618", "60601", "86898", "4459", "21918", "27041", "14312", "18519"]
]

data_lattice_table_area = [
data_lattice_table_areas = [
["", "", "", "", "", "", "", "", ""],
["State", "n", "Literacy Status", "", "", "", "", "", ""],
["", "", "Illiterate", "Read & Write", "1-4 std.", "5-8 std.", "9-12 std.", "College", ""],
Expand Down
12 changes: 6 additions & 6 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ def test_stream_table_rotated():
assert df.equals(tables[0].df)


def test_stream_table_area():
df = pd.DataFrame(data_stream_table_area)
def test_stream_table_areas():
df = pd.DataFrame(data_stream_table_areas)

filename = os.path.join(testdir, "tabula/us-007.pdf")
tables = camelot.read_pdf(filename, flavor="stream", table_area=["320,500,573,335"])
tables = camelot.read_pdf(filename, flavor="stream", table_areas=["320,500,573,335"])
assert df.equals(tables[0].df)


Expand Down Expand Up @@ -100,11 +100,11 @@ def test_lattice_table_rotated():
assert df.equals(tables[0].df)


def test_lattice_table_area():
df = pd.DataFrame(data_lattice_table_area)
def test_lattice_table_areas():
df = pd.DataFrame(data_lattice_table_areas)

filename = os.path.join(testdir, "twotables_2.pdf")
tables = camelot.read_pdf(filename, table_area=["80,693,535,448"])
tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"])
assert df.equals(tables[0].df)


Expand Down
4 changes: 2 additions & 2 deletions tests/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ def test_unsupported_format():


def test_stream_equal_length():
message = ("Length of table_area and columns"
message = ("Length of table_areas and columns"
" should be equal")
with pytest.raises(ValueError, message=message):
tables = camelot.read_pdf(filename, flavor='stream',
table_area=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])
table_areas=['10,20,30,40'], columns=['10,20,30,40', '10,20,30,40'])


def test_no_tables_found():
Expand Down