From 84403798466858abbb530be70cb4063c2a23bc9a Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Fri, 8 Mar 2024 16:38:40 -0400 Subject: [PATCH] Several fixes for table module - Add new method for outputting the table as a markdown string. - Address errors in computing the table header object: We now allow None as the cell value, because this will be resolved where needed (e.g. in the pandas DataFrame). We previously tried to enforce rect-like tuples in all header cell bboxes, however this fails for tables with all-None columns. This fix enables this and constructs an empty string in the corresponding cell string. We now correctly include start / stop points of lines in the bbox of the clustered graphic. We previously joined the line's rectangle - which had no effect because this is always empty. --- docs/page.rst | 29 ++++++------ src/table.py | 103 ++++++++++++++++++++++++------------------- tests/test_tables.py | 22 ++++++--- 3 files changed, 89 insertions(+), 65 deletions(-) diff --git a/docs/page.rst b/docs/page.rst index c8c3e805f..08888c4e9 100644 --- a/docs/page.rst +++ b/docs/page.rst @@ -419,27 +419,28 @@ In a nutshell, this is what you can do with PyMuPDF: :returns: a `TableFinder` object that has the following significant attributes: - * **cells:** a list of **all bboxes** on the page, that have been identified as table cells (across all tables). Each cell is a tuple `(x0, y0, x1, y1)` of coordinates or `None`. - * **tables:** a list of `Table` objects. This is `[]` if the page has no tables. Single tables can be found as items of this list. But the `TableFinder` object itself is also a sequence of its tables. This means that if `tabs` is a `TableFinder` object, then table "n" is delivered by `tabs.tables[n]` as well as by the shorter `tabs[n]`. + * `cells`: a list of **all bboxes** on the page, that have been identified as table cells (across all tables). Each cell is a :data:`rect_like` tuple `(x0, y0, x1, y1)` of coordinates or `None`. 
+ * `tables`: a list of `Table` objects. This is `[]` if the page has no tables. Single tables can be found as items of this list. But the `TableFinder` object itself is also a sequence of its tables. This means that if `tabs` is a `TableFinder` object, then table "n" is delivered by `tabs.tables[n]` as well as by the shorter `tabs[n]`. * The `Table` object has the following attributes: - * **bbox:** the bounding box of the table as a tuple `(x0, y0, x1, y1)`. - * **cells:** bounding boxes of the table's cells (list of tuples). A cell may also be `None`. - * **extract():** this method returns the text content of each table cell as a list of list of strings. - * **to_pandas():** this method returns the table as a `pandas `_ `DataFrame `_. - * **header:** a `TableHeader` object containing header information of the table. - * **col_count:** an integer containing the number of table columns. - * **row_count:** an integer containing the number of table rows. - * **rows:** a list of `TableRow` objects containing two attributes: *bbox* is the boundary box of the row, and *cells* is a list of table cells contained in this row. + * `bbox`: the bounding box of the table as a tuple `(x0, y0, x1, y1)`. + * `cells`: bounding boxes of the table's cells (list of tuples). A cell may also be `None`. + * `extract()`: this method returns the text content of each table cell as a list of lists of strings. + * `to_markdown()`: this method returns the table as a **string in markdown format** (compatible with GitHub). Supporting viewers can render the string as a table. This output is optimized for **small token** sizes, which is especially beneficial for LLM/RAG feeds. Pandas DataFrames (see method `to_pandas()` below) offer an equivalent markdown table output which, however, is more readable for the human eye. + * `to_pandas()`: this method returns the table as a `pandas `_ `DataFrame `_. 
DataFrames are very versatile objects allowing a plethora of table manipulation methods and outputs to almost 20 well-known formats, among them Excel files, CSV, JSON, markdown-formatted tables and more. `DataFrame.to_markdown()` generates a GitHub-compatible markdown format optimized for human readability. This method, however, requires the package `tabulate <https://pypi.org/project/tabulate/>`_ to be installed in addition to pandas itself. + * ``header``: a `TableHeader` object containing header information of the table. + * `col_count`: an integer containing the number of table columns. + * `row_count`: an integer containing the number of table rows. + * `rows`: a list of `TableRow` objects containing two attributes: ``bbox`` is the bounding box of the row, and `cells` is a list of table cells contained in this row. * The `TableHeader` object has the following attributes: - * **bbox:** the bounding box of the header. - * **cells:** a list of bounding boxes containing the name of the respective column. - * **names:** a list of strings containing the text of each of the cell bboxes. They represent the column names -- which can be used when exporting the table to pandas DataFrames or CSV, etc. - * **external:** a bool indicating whether the header bbox is outside the table body (`True`) or not. Table headers are never identified by the `TableFinder` logic. Therefore, if *external* is true, then the header cells are not part of any cell identified by `TableFinder`. If `external == False`, then the first table row is the header. + * ``bbox``: the bounding box of the header. + * `cells`: a list of bounding boxes containing the name of the respective column. + * `names`: a list of strings containing the text of each of the cell bboxes. They represent the column names -- which are used when exporting the table to pandas DataFrames, markdown, etc. + * `external`: a bool indicating whether the header bbox is outside the table body (`True`) or not. 
Table headers are never identified by the `TableFinder` logic. Therefore, if `external` is true, then the header cells are not part of any cell identified by `TableFinder`. If `external == False`, then the first table row is the header. Please have a look at these `Jupyter notebooks `_, which cover standard situations like multiple tables on one page or joining table fragments across multiple pages. diff --git a/src/table.py b/src/table.py index d95aee933..aa0bdae7c 100644 --- a/src/table.py +++ b/src/table.py @@ -1365,6 +1365,46 @@ def char_in_bbox(char, bbox) -> bool: return table_arr + def to_markdown(self, clean=True): + """Output table content as a string in Github-markdown format. + + If clean is true, markdown syntax is removed from cell content.""" + output = "|" + + # generate header string and MD underline + for i, name in enumerate(self.header.names): + if name is None or name == "": # generate a name if empty + name = f"Col{i+1}" + name = name.replace("\n", " ") # remove any line breaks + if clean: # remove sensitive syntax + name = ( + name.replace("<", "&lt;").replace(">", "&gt;").replace("-", "&#45;") + ) + output += name + "|" + + output += "\n" + output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n" + + # skip first row in details if header is part of the table + j = 0 if self.header.external else 1 + + # iterate over detail rows + for row in self.extract()[j:]: + line = "|" + for i, cell in enumerate(row): + # output None cells with empty string + cell = "" if cell is None else cell.replace("\n", " ") + if clean: # remove sensitive syntax + cell = ( + cell.replace("<", "&lt;") + .replace(">", "&gt;") + .replace("-", "&#45;") + ) + line += cell + "|" + line += "\n" + output += line + return output + "\n" + def to_pandas(self, **kwargs): """Return a pandas DataFrame version of the table.""" try: @@ -1443,43 +1483,6 @@ def top_row_is_bold(bbox): return True return False - def recover_top_row_cells(table): - """Recreates top row cells if 'None' columns 
are present. - - We need all column x-coordinates even when the top table row - contains None cells. - """ - bbox = Rect(table.rows[0].bbox) # top row bbox - tbbox = Rect(table.bbox) # table bbox - y0, y1 = bbox.y0, bbox.y1 # top row upper / lower coordinates - - # make sure row0 bbox has the full table width - bbox.x0 = tbbox.x0 - bbox.x1 = tbbox.x1 - - l_r = set() # (x0, x1) pairs for all table cells - for cell in table.cells: - if cell is None: # skip non-existing cells - continue - cellbb = Rect(cell) - - # only accept cells wider than a character - if 10 < cellbb.width < tbbox.width: - l_r.add((cell[0], cell[2])) - - # sort (x0, x1) pairs by x0-values - l_r = sorted(list(l_r), key=lambda c: c[0]) - if not l_r: - return [], (0, 0, 0, 0) - - # recovered row 0 cells - cells = [(l_r[0][0], y0, l_r[0][1], y1)] - - for x0, x1 in l_r[1:]: - if x0 >= cells[-1][2]: - cells.append((x0, y0, x1, y1)) - return cells, bbox - try: row = self.rows[0] cells = row.cells @@ -1487,9 +1490,6 @@ def recover_top_row_cells(table): except IndexError: # this table has no rows return None - if None in cells: # if row 0 has empty cells, repair it - cells, bbox = recover_top_row_cells(self) - # return this if we determine that the top row is the header header_top_row = TableHeader(bbox, cells, self.extract()[0], False) @@ -1501,7 +1501,9 @@ def recover_top_row_cells(table): if len(cells) < 2: return header_top_row - col_x = [c[2] for c in cells[:-1]] # column (x) coordinates + col_x = [ + c[2] if c is not None else None for c in cells[:-1] + ] # column (x) coordinates # Special check: is top row bold? 
# If first line above table is not bold, but top-left table cell is bold, @@ -1600,6 +1602,7 @@ def recover_top_row_cells(table): intersecting = [ (x, r) for x in col_x + if x is not None for r in word_rects if r[1] == top and r[0] < x and r[2] > x ] @@ -1613,15 +1616,22 @@ def recover_top_row_cells(table): hdr_bbox = +clip # compute the header cells hdr_bbox.y0 = select[-1] # hdr_bbox top is smallest top coord of words - hdr_cells = [(c[0], hdr_bbox.y0, c[2], hdr_bbox.y1) for c in cells] + hdr_cells = [ + (c[0], hdr_bbox.y0, c[2], hdr_bbox.y1) if c is not None else None + for c in cells + ] # adjust left/right of header bbox - hdr_bbox.x0 = hdr_cells[0][0] - hdr_bbox.x1 = hdr_cells[-1][2] + hdr_bbox.x0 = self.bbox[0] + hdr_bbox.x1 = self.bbox[2] # column names: no line breaks, no excess spaces hdr_names = [ - page.get_textbox(c).replace("\n", " ").replace("  ", " ").strip() + ( + page.get_textbox(c).replace("\n", " ").replace("  ", " ").strip() + if c is not None + else "" + ) for c in hdr_cells ] return TableHeader(tuple(hdr_bbox), hdr_cells, hdr_names, True) @@ -2012,7 +2022,8 @@ def clean_graphics(): repeat = False # set to true again if some other rect touches for i in range(len(prects) - 1, 0, -1): # run backwards if are_neighbors(prect0, prects[i]): # close enough to rect 0? 
- prect0 |= prects[i] # extend rect 0 + prect0 |= prects[i].tl # extend rect 0 + prect0 |= prects[i].br # extend rect 0 del prects[i] # delete this rect repeat = True # keep checking the rest diff --git a/tests/test_tables.py b/tests/test_tables.py index e4b684a2a..a785042ba 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -15,11 +15,23 @@ def test_table1(): doc = fitz.open(filename) page = doc[0] tabs = page.find_tables() - cells = [tabs[0].cells, tabs[1].cells] - extracts = [tabs[0].extract(), tabs[1].extract()] - new_data = {"cells": cells, "extracts": extracts} - old_data = pickle.load(pickle_in) - assert old_data == new_data + cells = tabs[0].cells + tabs[1].cells # all table cell tuples on page + extracts = [tabs[0].extract(), tabs[1].extract()] # all table cell content + old_data = pickle.load(pickle_in) # previously saved data + + # Compare cell contents + assert old_data["extracts"] == extracts # same cell contents + + # Compare cell coordinates. + # Cell rectangles may get somewhat larger due to more cautious border + # computations, but any differences must be small. + old_cells = old_data["cells"][0] + old_data["cells"][1] + assert len(cells) == len(old_cells) + for i in range(len(cells)): + c1 = fitz.Rect(cells[i]) # new cell coordinates + c0 = fitz.Rect(old_cells[i]) # old cell coordinates + assert c0 in c1 # always: old contained in new + assert abs(c1 - c0) < 0.2 # difference must be small def test_table2():