From 293c2118e865a885e79bd5dc777dd1c975c8b37b Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Thu, 23 May 2024 15:40:19 -0700 Subject: [PATCH] fix: disable table_as_cells output by default <- Ingest test fixtures update (#3094) This pull request includes updated ingest test fixtures. Please review and merge if appropriate. Co-authored-by: badGarnet --- .../layout-parser-paper-with-table.json | 170 -------- .../layout-parser-paper.json | 382 ------------------ 2 files changed, 552 deletions(-) diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.json index 78e2ebd6e6..f00183ac2b 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.json @@ -49,176 +49,6 @@ "text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century \u2018TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents", "metadata": { "text_as_html": "
Dataset| Base Model!|Large Model| Notes
PubLayNet [33]P/MMLayouts of modern scientific documents
PRImA [3]MLayouts of scanned modern magazines and scientific reports
Newspaper [17]PLayouts of scanned US newspapers from the 20th century
TableBank [18]PTable region on modern scientific and business document
HIDataset [31]P/MLayouts of history Japanese documents
", - "table_as_cells": [ - { - "x": 0, - "y": 0, - "w": 1, - "h": 1, - "content": "Dataset" - }, - { - "x": 0, - "y": 1, - "w": 1, - "h": 1, - "content": "PubLayNet [33]" - }, - { - "x": 0, - "y": 2, - "w": 1, - "h": 1, - "content": "PRImA [3]" - }, - { - "x": 0, - "y": 3, - "w": 1, - "h": 1, - "content": "Newspaper [17]" - }, - { - "x": 0, - "y": 4, - "w": 1, - "h": 1, - "content": "TableBank [18]" - }, - { - "x": 0, - "y": 5, - "w": 1, - "h": 1, - "content": "HIDataset [31]" - }, - { - "x": 1, - "y": 0, - "w": 1, - "h": 1, - "content": "| Base Model!|" - }, - { - "x": 1, - "y": 1, - "w": 1, - "h": 1, - "content": "P/M" - }, - { - "x": 1, - "y": 2, - "w": 1, - "h": 1, - "content": "M" - }, - { - "x": 1, - "y": 3, - "w": 1, - "h": 1, - "content": "P" - }, - { - "x": 1, - "y": 4, - "w": 1, - "h": 1, - "content": "P" - }, - { - "x": 1, - "y": 5, - "w": 1, - "h": 1, - "content": "P/M" - }, - { - "x": 2, - "y": 0, - "w": 1, - "h": 1, - "content": "Large Model" - }, - { - "x": 2, - "y": 1, - "w": 1, - "h": 1, - "content": "M" - }, - { - "x": 2, - "y": 2, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 2, - "y": 3, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 2, - "y": 4, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 2, - "y": 5, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 3, - "y": 0, - "w": 1, - "h": 1, - "content": "| Notes" - }, - { - "x": 3, - "y": 1, - "w": 1, - "h": 1, - "content": "Layouts of modern scientific documents" - }, - { - "x": 3, - "y": 2, - "w": 1, - "h": 1, - "content": "Layouts of scanned modern magazines and scientific reports" - }, - { - "x": 3, - "y": 3, - "w": 1, - "h": 1, - "content": "Layouts of scanned US newspapers from the 20th century" - }, - { - "x": 3, - "y": 4, - "w": 1, - "h": 1, - "content": "Table region on modern scientific and business document" - }, - { - "x": 3, - "y": 5, - "w": 1, - "h": 1, - "content": "Layouts of history Japanese documents" - } - ], "filetype": "image/jpeg", "languages": [ "eng" diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json index 5ff6322270..03a9917a81 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json @@ -841,134 +841,6 @@ "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scienti\ufb01c documents Layouts of scanned modern magazines and scienti\ufb01c reports Layouts of scanned US newspapers from the 20th century Table region on modern scienti\ufb01c and business document Layouts of history Japanese documents", "metadata": { "text_as_html": "
Dataset| Base Model'|| Notes
PubLayNet B8]|F/MLayouts of modern scientific documents
PRImAMLayouts of scanned modern magazines and scientific report
NewspaperFLayouts of scanned US newspapers from the 20th century
TableBankFTable region on modern scientific and business document
HJDatasetF/MLayouts of history Japanese documents
", - "table_as_cells": [ - { - "x": 0, - "y": 0, - "w": 1, - "h": 1, - "content": "Dataset" - }, - { - "x": 0, - "y": 1, - "w": 1, - "h": 1, - "content": "PubLayNet B8]|" - }, - { - "x": 0, - "y": 2, - "w": 1, - "h": 1, - "content": "PRImA" - }, - { - "x": 0, - "y": 3, - "w": 1, - "h": 1, - "content": "Newspaper" - }, - { - "x": 0, - "y": 4, - "w": 1, - "h": 1, - "content": "TableBank" - }, - { - "x": 0, - "y": 5, - "w": 1, - "h": 1, - "content": "HJDataset" - }, - { - "x": 1, - "y": 0, - "w": 1, - "h": 1, - "content": "| Base Model'|" - }, - { - "x": 1, - "y": 1, - "w": 1, - "h": 1, - "content": "F/M" - }, - { - "x": 1, - "y": 2, - "w": 1, - "h": 1, - "content": "M" - }, - { - "x": 1, - "y": 3, - "w": 1, - "h": 1, - "content": "F" - }, - { - "x": 1, - "y": 4, - "w": 1, - "h": 1, - "content": "F" - }, - { - "x": 1, - "y": 5, - "w": 1, - "h": 1, - "content": "F/M" - }, - { - "x": 2, - "y": 0, - "w": 1, - "h": 1, - "content": "| Notes" - }, - { - "x": 2, - "y": 1, - "w": 1, - "h": 1, - "content": "Layouts of modern scientific documents" - }, - { - "x": 2, - "y": 2, - "w": 1, - "h": 1, - "content": "Layouts of scanned modern magazines and scientific report" - }, - { - "x": 2, - "y": 3, - "w": 1, - "h": 1, - "content": "Layouts of scanned US newspapers from the 20th century" - }, - { - "x": 2, - "y": 4, - "w": 1, - "h": 1, - "content": "Table region on modern scientific and business document" - }, - { - "x": 2, - "y": 5, - "w": 1, - "h": 1, - "content": "Layouts of history Japanese documents" - } - ], "filetype": "application/pdf", "languages": [ "eng" @@ -1520,260 +1392,6 @@ "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2\u2019s absolute coordinates block.crop image(image) Obtain the image segments in the block region", "metadata": { "text_as_html": "
block.pad(top, bottom,right,left)Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio in x and y direction
block.shift(dx, dy)Move the current block with the shift distances in x and y direction
block1.is_in(block2)Whether block] is inside of block2
block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates
block. crop_image (image)Obtain the image segments in the block region
", - "table_as_cells": [ - { - "x": 0, - "y": 0, - "w": 1, - "h": 1, - "content": "block.pad(top, bottom," - }, - { - "x": 0, - "y": 1, - "w": 1, - "h": 1, - "content": "block.scale(fx, fy)" - }, - { - "x": 0, - "y": 2, - "w": 1, - "h": 1, - "content": "block.shift(dx, dy)" - }, - { - "x": 0, - "y": 3, - "w": 1, - "h": 1, - "content": "block1.is_in(block2)" - }, - { - "x": 0, - "y": 4, - "w": 1, - "h": 1, - "content": "block1. intersect (block2)" - }, - { - "x": 0, - "y": 5, - "w": 1, - "h": 1, - "content": "block1.union(block2)" - }, - { - "x": 0, - "y": 6, - "w": 1, - "h": 1, - "content": "block1.relative_to(block2)" - }, - { - "x": 0, - "y": 7, - "w": 1, - "h": 1, - "content": "block1.condition_on(block2)" - }, - { - "x": 0, - "y": 8, - "w": 1, - "h": 1, - "content": "block. crop_image (image)" - }, - { - "x": 1, - "y": 0, - "w": 1, - "h": 1, - "content": "right," - }, - { - "x": 1, - "y": 1, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 1, - "y": 2, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 1, - "y": 3, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 1, - "y": 4, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 1, - "y": 5, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 1, - "y": 6, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 1, - "y": 7, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 1, - "y": 8, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 2, - "y": 0, - "w": 1, - "h": 1, - "content": "left)" - }, - { - "x": 2, - "y": 1, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 2, - "y": 2, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 2, - "y": 3, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 2, - "y": 4, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 2, - "y": 5, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 2, - "y": 6, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 2, - "y": 7, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 2, - "y": 8, - "w": 1, - "h": 1, - "content": "" - }, - { - "x": 3, - "y": 0, - "w": 1, - "h": 1, - "content": "Enlarge the current block according to the input" - }, - { - "x": 3, - "y": 1, - "w": 1, - "h": 1, - "content": "Scale the current block given the ratio in x and y direction" - }, - { - "x": 3, - "y": 2, - "w": 1, - "h": 1, - "content": "Move the current block with the shift distances in x and y direction" - }, - { - "x": 3, - "y": 3, - "w": 1, - "h": 1, - "content": "Whether block] is inside of block2" - }, - { - "x": 3, - "y": 4, - "w": 1, - "h": 1, - "content": "Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs" - }, - { - "x": 3, - "y": 5, - "w": 1, - "h": 1, - "content": "Return the union region of blockl and block2. Coordinate type to be determined based on the inputs" - }, - { - "x": 3, - "y": 6, - "w": 1, - "h": 1, - "content": "Convert the absolute coordinates of block to relative coordinates to block2" - }, - { - "x": 3, - "y": 7, - "w": 1, - "h": 1, - "content": "Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates" - }, - { - "x": 3, - "y": 8, - "w": 1, - "h": 1, - "content": "Obtain the image segments in the block region" - } - ], "filetype": "application/pdf", "languages": [ "eng"