Skip to content

Commit

Permalink
Update notebooks (#522)
Browse files Browse the repository at this point in the history
  • Loading branch information
vinayak-mehta authored Dec 30, 2024
2 parents 12687dc + bef80bf commit fb334ca
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 75 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
"source": [
"# Quickstart example\n",
"\n",
"This notebook shows you a quick way to get started with [pypdf_table_extraction](https://github.com/py-pdf/pypdf_table_extraction) .\n",
"This notebook shows you how to quickly get started with [camelot](https://github.com/camelot-dev/camelot).\n",
"\n",
"**Usage:** Either upload files or provide a PDF URL in the specified cells."
"**Usage:** Either upload PDFs or add a URL to a PDF in the specified cells."
]
},
{
Expand All @@ -21,8 +21,8 @@
},
"outputs": [],
"source": [
"# @title 🛠️ Install [pypdf_table_extraction](https://github.com/py-pdf/pypdf_table_extraction)\n",
"!pip install pypdf-table-extraction\n",
"# @title 🛠️ Install [camelot](https://github.com/camelot-dev/camelot)\n",
"!pip install camelot-py\n",
"# Install tabulate (optional); it is only needed in this notebook for pretty display of results.\n",
"!pip install tabulate"
]
Expand All @@ -38,11 +38,9 @@
"\n",
"sys.path.insert(0, os.path.abspath(\"\")) # Prefer the local version if available\n",
"\n",
"import pypdf_table_extraction\n",
"import camelot\n",
"\n",
"print(\n",
" f\"Using pypdf_table_extraction v{pypdf_table_extraction.__version__} from file {pypdf_table_extraction.__file__}.\"\n",
")"
"print(f\"Using camelot v{camelot.__version__}.\")"
]
},
{
Expand Down Expand Up @@ -154,6 +152,7 @@
"\n",
"# import os\n",
"import requests\n",
"\n",
"# from pathlib import Path\n",
"\n",
"\n",
Expand All @@ -168,7 +167,7 @@
"\n",
"\n",
"# Sample .pdf data from GitHub\n",
"pdf_url = \"https://github.com/py-pdf/pypdf_table_extraction/blob/main/docs/_static/pdf/foo.pdf\" # @param {type:\"string\"}\n",
"pdf_url = \"https://github.com/camelot-dev/camelot/blob/main/docs/_static/pdf/foo.pdf\" # @param {type:\"string\"}\n",
"\n",
"# Convert the GitHub URL to the raw content URL\n",
"pdf_url = convert_github_url_to_raw(pdf_url)\n",
Expand Down Expand Up @@ -206,7 +205,6 @@
},
"outputs": [],
"source": [
"# import pypdf_table_extraction\n",
"import logging\n",
"import pandas as pd\n",
"\n",
Expand Down Expand Up @@ -235,9 +233,9 @@
" logging.error(f\"Failed to open PDF {pdf_file.name} with PdfReader: {e}\")\n",
" return\n",
"\n",
" # Read tables from the PDF using pypdf_table_extraction\n",
" # Read tables from the PDF using camelot\n",
" try:\n",
" tables = pypdf_table_extraction.read_pdf(str(pdf_file))\n",
" tables = camelot.read_pdf(str(pdf_file))\n",
" except Exception as e:\n",
" print(f\"Failed to read PDF {pdf_file.name}: {e}\")\n",
" logging.error(f\"Failed to read PDF {pdf_file.name}: {e}\")\n",
Expand Down Expand Up @@ -316,7 +314,6 @@
"source": [
"# @title ⚙️ Core - Complex Tables (Loose Parameters) with Clean Output\n",
"\n",
"# import pypdf_table_extraction\n",
"# import os\n",
"# from pathlib import Path\n",
"import pandas as pd\n",
Expand All @@ -333,11 +330,11 @@
" print(f\"\\nProcessing {pdf_file.name}\")\n",
"\n",
" # Try the 'network' parsing method first (no table_areas restriction)\n",
" tables_network = pypdf_table_extraction.read_pdf(str(pdf_file), flavor=\"network\")\n",
" tables_network = camelot.read_pdf(str(pdf_file), flavor=\"network\")\n",
"\n",
" if len(tables_network) == 0:\n",
" # If no tables are detected, try using 'lattice' parser\n",
" tables_lattice = pypdf_table_extraction.read_pdf(\n",
" tables_lattice = camelot.read_pdf(\n",
" str(pdf_file), flavor=\"lattice\", table_areas=[\"50,750,500,50\"]\n",
" )\n",
"\n",
Expand Down Expand Up @@ -395,6 +392,7 @@
"# @title 🗑️ Clear Input & Output Directory\n",
"\n",
"import shutil\n",
"\n",
"# from pathlib import Path\n",
"# import os\n",
"\n",
Expand Down Expand Up @@ -447,7 +445,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
"version": "3.13.0"
}
},
"nbformat": 4,
Expand Down
49 changes: 17 additions & 32 deletions examples/hybrid-parser-step-by-step.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup pypdf_table_extraction"
"# Install camelot"
]
},
{
Expand All @@ -25,20 +25,7 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"os.getcwd()\n",
"# Install from source\n",
"!git clone -b main https://github.com/py-pdf/pypdf_table_extraction.git src\n",
"%cd src\n",
"\n",
"\n",
"!pip install -e .\n",
"\n",
"# Optionally you can Install ghostscript as the imageconversion backend.\n",
"# uncomment the following lines\n",
"# !apt-get install -y ghostscript\n",
"# !pip install ghostscript"
"!pip install camelot-py"
]
},
{
Expand All @@ -47,7 +34,7 @@
"source": [
"# Select a PDF file to analyze\n",
"\n",
"You can modify the section below to point to a pdf or your choice to visualize how the algorithm analyzes it. By default, it points to one of the test .pdfs included with pypdf_table_extraction."
"You can modify the section below to point to a PDF of your choice to visualize how the algorithm analyzes it. By default, it points to one of the test PDFs included with camelot."
]
},
{
Expand All @@ -63,12 +50,10 @@
"\n",
"sys.path.insert(\n",
" 0, os.path.abspath(\"\")\n",
") # Prefer the local version of pypdf_table_extraction if available\n",
"import pypdf_table_extraction\n",
") # Prefer the local version of camelot if available\n",
"import camelot\n",
"\n",
"print(\n",
" f\"Using pypdf_table_extraction v{pypdf_table_extraction.__version__} from file {pypdf_table_extraction.__file__}.\"\n",
")\n",
"print(f\"Using camelot v{camelot.__version__}.\")\n",
"\n",
"# Select a pdf to analyze.\n",
"kwargs = {}\n",
Expand Down Expand Up @@ -127,7 +112,7 @@
"# Set up plotting options\n",
"import matplotlib.pyplot as plt\n",
"\n",
"%matplotlib inline\n",
"# %matplotlib inline\n",
"PLOT_HEIGHT = 12\n",
"\n",
"\n",
Expand Down Expand Up @@ -182,12 +167,12 @@
"# Parse file\n",
"flavor = \"network\"\n",
"timer_before_parse = time.perf_counter()\n",
"tables = pypdf_table_extraction.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
"tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
"timer_after_parse = time.perf_counter()\n",
"\n",
"if tables is not None:\n",
" fig, ax = init_figure_and_axis(f\"Text elements in PDF\\n{pdf_file}\")\n",
" pypdf_table_extraction.plot(tables[0], kind=\"text\", ax=ax)\n",
" camelot.plot(tables[0], kind=\"text\", ax=ax)\n",
"else:\n",
" print(\"No table found for this document.\")"
]
Expand All @@ -213,7 +198,7 @@
"source": [
"if tables is not None:\n",
" fig, ax = init_figure_and_axis(f\"Text edges in PDF\\n{pdf_file}\")\n",
" pypdf_table_extraction.plot(tables[0], kind=\"textedge\", ax=ax)\n",
" camelot.plot(tables[0], kind=\"textedge\", ax=ax)\n",
"else:\n",
" print(f\"No table found for document {pdf_file}.\")"
]
Expand Down Expand Up @@ -243,7 +228,7 @@
"source": [
"if tables is not None:\n",
" fig, ax = init_figure_and_axis(f\"Growth steps for table in PDF\\n{pdf_file}\")\n",
" pypdf_table_extraction.plot(tables[0], kind=\"network_table_search\", ax=ax)\n",
" camelot.plot(tables[0], kind=\"network_table_search\", ax=ax)\n",
"else:\n",
" print(\"No table found for this document.\")"
]
Expand Down Expand Up @@ -292,12 +277,12 @@
"# Parse file\n",
"flavor = \"lattice\"\n",
"timer_before_parse = time.perf_counter()\n",
"tables = pypdf_table_extraction.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
"tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
"timer_after_parse = time.perf_counter()\n",
"\n",
"if tables is not None:\n",
" fig, ax = init_figure_and_axis(f\"Line structure in PDF\\n{pdf_file}\")\n",
" pypdf_table_extraction.plot(tables[0], kind=\"line\", ax=ax)\n",
" camelot.plot(tables[0], kind=\"line\", ax=ax)\n",
"else:\n",
" print(\"No table found for this document.\")"
]
Expand All @@ -319,7 +304,7 @@
"source": [
"for table in tables:\n",
" fig, ax = init_figure_and_axis(f\"Contour structure in PDF\\n{pdf_file}\")\n",
" pypdf_table_extraction.plot(table, kind=\"contour\", ax=ax)"
" camelot.plot(table, kind=\"contour\", ax=ax)"
]
},
{
Expand All @@ -339,7 +324,7 @@
"source": [
"for table in tables:\n",
" fig, ax = init_figure_and_axis(f\"Joint structure in PDF\\n{pdf_file}\")\n",
" pypdf_table_extraction.plot(table, kind=\"joint\", ax=ax)"
" camelot.plot(table, kind=\"joint\", ax=ax)"
]
},
{
Expand Down Expand Up @@ -392,7 +377,7 @@
"source": [
"flavor = \"hybrid\"\n",
"timer_before_parse = time.perf_counter()\n",
"tables = pypdf_table_extraction.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
"tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
"timer_after_parse = time.perf_counter()\n",
"\n",
"display_parse_results(tables, timer_after_parse - timer_before_parse, flavor)"
Expand All @@ -416,7 +401,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
"version": "3.13.0"
},
"mimetype": "text/x-python",
"name": "python",
Expand Down
35 changes: 9 additions & 26 deletions examples/parser-comparison-notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup pypdf_table_extraction"
"# Install camelot"
]
},
{
Expand All @@ -22,20 +22,7 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"os.getcwd()\n",
"# Install from source\n",
"!git clone -b main https://github.com/py-pdf/pypdf_table_extraction.git src\n",
"%cd src\n",
"\n",
"\n",
"!pip install -e .\n",
"\n",
"# Optionally you can Install ghostscript as the imageconversion backend.\n",
"# uncomment the following lines\n",
"# !apt-get install -y ghostscript\n",
"# !pip install ghostscript"
"!pip install camelot-py"
]
},
{
Expand All @@ -51,12 +38,10 @@
"\n",
"sys.path.insert(\n",
" 0, os.path.abspath(\"\")\n",
") # Prefer the local version of pypdf_table_extraction if available\n",
"import pypdf_table_extraction\n",
") # Prefer the local version of camelot if available\n",
"import camelot\n",
"\n",
"print(\n",
" f\"Using pypdf_table_extraction v{pypdf_table_extraction.__version__} from file {pypdf_table_extraction.__file__}.\"\n",
")"
"print(f\"Using camelot v{camelot.__version__}.\")"
]
},
{
Expand All @@ -65,7 +50,7 @@
"source": [
"## Select a PDF file to review\n",
"\n",
"You can modify the section below to point to a pdf or your choice to visualize the results. By default, it points to one of the test .pdfs included with pypdf_table_extraction.\n",
"You can modify the section below to point to a PDF of your choice to visualize the results. By default, it points to one of the test PDFs included with camelot.\n",
"This is seeded with the unit test files for convenience."
]
},
Expand Down Expand Up @@ -145,9 +130,7 @@
" timer_before_parse = time.perf_counter()\n",
" error, tables = None, []\n",
" try:\n",
" tables = pypdf_table_extraction.read_pdf(\n",
" filename, flavor=flavor, debug=True, **kwargs\n",
" )\n",
" tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
" except ValueError as value_error:\n",
" error = f\"Invalid argument for parser {flavor}: {value_error}\"\n",
" print(error)\n",
Expand Down Expand Up @@ -195,7 +178,7 @@
"# Set up plotting options\n",
"import matplotlib.pyplot as plt\n",
"\n",
"%matplotlib inline\n",
"# %matplotlib inline\n",
"PLOT_HEIGHT = 12\n",
"\n",
"row_count = max(max_tables, 1)\n",
Expand Down Expand Up @@ -260,7 +243,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
"version": "3.13.0"
},
"mimetype": "text/x-python",
"name": "python",
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ readme = "README.md"
classifiers = [
"Development Status :: 5 - Production/Stable",
]
requires-python = ">=3.9"
requires-python = ">=3.8"
dependencies = [
"click>=8.0.1",
"chardet>=5.1.0",
Expand Down

0 comments on commit fb334ca

Please sign in to comment.