From 216bb6d7dd97038fa86fd61cde5f1c56c390ef0d Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Mon, 30 Dec 2024 11:33:30 +0100 Subject: [PATCH 1/3] Update notebooks --- ...pynb => camelot-quickstart-notebook.ipynb} | 30 ++++++------ examples/hybrid-parser-step-by-step.ipynb | 49 +++++++------------ examples/parser-comparison-notebook.ipynb | 35 ++++--------- 3 files changed, 40 insertions(+), 74 deletions(-) rename examples/{pypdf_table_extraction_quick_start_notebook.ipynb => camelot-quickstart-notebook.ipynb} (93%) diff --git a/examples/pypdf_table_extraction_quick_start_notebook.ipynb b/examples/camelot-quickstart-notebook.ipynb similarity index 93% rename from examples/pypdf_table_extraction_quick_start_notebook.ipynb rename to examples/camelot-quickstart-notebook.ipynb index ee369129..5b42246d 100644 --- a/examples/pypdf_table_extraction_quick_start_notebook.ipynb +++ b/examples/camelot-quickstart-notebook.ipynb @@ -8,9 +8,9 @@ "source": [ "# Quickstart example\n", "\n", - "This notebook shows you a quick way to get started with [pypdf_table_extraction](https://github.com/py-pdf/pypdf_table_extraction) .\n", + "This notebook shows you how to quickly get started with [camelot](https://github.com/camelot-dev/camelot) .\n", "\n", - "**Usage:** Either upload files or provide a PDF URL in the specified cells." + "**Usage:** Either upload PDFs or add a URL to a PDF in the specified cells." ] }, { @@ -21,8 +21,8 @@ }, "outputs": [], "source": [ - "# @title 🛠️ Install [pypdf_table_extraction](https://github.com/py-pdf/pypdf_table_extraction)\n", - "!pip install pypdf-table-extraction\n", + "# @title 🛠️ Install [camelot](https://github.com/camelot-dev/camelot)\n", + "!pip install camelot-py\n", "# install tabulate (optional) only needed in this notebook for pretty display of results.\n", "!pip install tabulate" ] @@ -38,11 +38,9 @@ "\n", "sys.path.insert(0, os.path.abspath(\"\")) # Prefer the local version if available\n", "\n", - "import pypdf_table_extraction\n", + "import camelot\n", "\n", - "print(\n", - " f\"Using pypdf_table_extraction v{pypdf_table_extraction.__version__} from file {pypdf_table_extraction.__file__}.\"\n", - ")" + "print(f\"Using camelot v{camelot.__version__}.\")" ] }, { @@ -154,6 +152,7 @@ "\n", "# import os\n", "import requests\n", + "\n", "# from pathlib import Path\n", "\n", "\n", @@ -168,7 +167,7 @@ "\n", "\n", "# Sample .pdf data from GitHub\n", - "pdf_url = \"https://github.com/py-pdf/pypdf_table_extraction/blob/main/docs/_static/pdf/foo.pdf\" # @param {type:\"string\"}\n", + "pdf_url = \"https://github.com/camelot-dev/camelot/blob/main/docs/_static/pdf/foo.pdf\" # @param {type:\"string\"}\n", "\n", "# Convert the GitHub URL to the raw content URL\n", "pdf_url = convert_github_url_to_raw(pdf_url)\n", @@ -206,7 +205,6 @@ }, "outputs": [], "source": [ - "# import pypdf_table_extraction\n", "import logging\n", "import pandas as pd\n", "\n", @@ -235,9 +233,9 @@ " logging.error(f\"Failed to open PDF {pdf_file.name} with PdfReader: {e}\")\n", " return\n", "\n", - " # Read tables from the PDF using pypdf_table_extraction\n", + " # Read tables from the PDF using camelot\n", " try:\n", - " tables = pypdf_table_extraction.read_pdf(str(pdf_file))\n", + " tables = camelot.read_pdf(str(pdf_file))\n", " except Exception as e:\n", " print(f\"Failed to read PDF {pdf_file.name}: {e}\")\n", " logging.error(f\"Failed to read PDF {pdf_file.name}: {e}\")\n", @@ -316,7 +314,6 @@ "source": [ "# @title ⚙️ Core - Complex Tables (Loose Parameters) with Clean Output\n", "\n", - "# import pypdf_table_extraction\n", "# import os\n", "# from pathlib import Path\n", "import pandas as pd\n", @@ -333,11 +330,11 @@ " print(f\"\\nProcessing {pdf_file.name}\")\n", "\n", " # Using 'network' parsing method with table_areas\n", - " tables_network = pypdf_table_extraction.read_pdf(str(pdf_file), flavor=\"network\")\n", + " tables_network = camelot.read_pdf(str(pdf_file), flavor=\"network\")\n", "\n", " if len(tables_network) == 0:\n", " # If no tables are detected, try using 'lattice' parser\n", - " tables_lattice = pypdf_table_extraction.read_pdf(\n", + " tables_lattice = camelot.read_pdf(\n", " str(pdf_file), flavor=\"lattice\", table_areas=[\"50,750,500,50\"]\n", " )\n", "\n", @@ -395,6 +392,7 @@ "# @title 🗑️ Clear Input & Output Directory\n", "\n", "import shutil\n", + "\n", "# from pathlib import Path\n", "# import os\n", "\n", @@ -447,7 +445,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.5" + "version": "3.13.0" } }, "nbformat": 4, diff --git a/examples/hybrid-parser-step-by-step.ipynb b/examples/hybrid-parser-step-by-step.ipynb index 76cecfc9..78d71b58 100644 --- a/examples/hybrid-parser-step-by-step.ipynb +++ b/examples/hybrid-parser-step-by-step.ipynb @@ -16,7 +16,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Setup pypdf_table_extraction" + "# Install camelot" ] }, { @@ -25,20 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", - "\n", - "os.getcwd()\n", - "# Install from source\n", - "!git clone -b main https://github.com/py-pdf/pypdf_table_extraction.git src\n", - "%cd src\n", - "\n", - "\n", - "!pip install -e .\n", - "\n", - "# Optionally you can Install ghostscript as the imageconversion backend.\n", - "# uncomment the following lines\n", - "# !apt-get install -y ghostscript\n", - "# !pip install ghostscript" + "!pip install camelot-py" ] }, { @@ -47,7 +34,7 @@ "source": [ "# Select a PDF file to analyze\n", "\n", - "You can modify the section below to point to a pdf or your choice to visualize how the algorithm analyzes it. By default, it points to one of the test .pdfs included with pypdf_table_extraction." + "You can modify the section below to point to a pdf or your choice to visualize how the algorithm analyzes it. By default, it points to one of the test .pdfs included with camelot." ] }, { @@ -63,12 +50,10 @@ "\n", "sys.path.insert(\n", " 0, os.path.abspath(\"\")\n", - ") # Prefer the local version of pypdf_table_extraction if available\n", - "import pypdf_table_extraction\n", + ") # Prefer the local version of camelot if available\n", + "import camelot\n", "\n", - "print(\n", - " f\"Using pypdf_table_extraction v{pypdf_table_extraction.__version__} from file {pypdf_table_extraction.__file__}.\"\n", - ")\n", + "print(f\"Using camelot v{camelot.__version__}.\")\n", "\n", "# Select a pdf to analyze.\n", "kwargs = {}\n", @@ -127,7 +112,7 @@ "# Set up plotting options\n", "import matplotlib.pyplot as plt\n", "\n", - "%matplotlib inline\n", + "# %matplotlib inline\n", "PLOT_HEIGHT = 12\n", "\n", "\n", @@ -182,12 +167,12 @@ "# Parse file\n", "flavor = \"network\"\n", "timer_before_parse = time.perf_counter()\n", - "tables = pypdf_table_extraction.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n", + "tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n", "timer_after_parse = time.perf_counter()\n", "\n", "if tables is not None:\n", " fig, ax = init_figure_and_axis(f\"Text elements in PDF\\n{pdf_file}\")\n", - " pypdf_table_extraction.plot(tables[0], kind=\"text\", ax=ax)\n", + " camelot.plot(tables[0], kind=\"text\", ax=ax)\n", "else:\n", " print(\"No table found for this document.\")" ] @@ -213,7 +198,7 @@ "source": [ "if tables is not None:\n", " fig, ax = init_figure_and_axis(f\"Text edges in PDF\\n{pdf_file}\")\n", - " pypdf_table_extraction.plot(tables[0], kind=\"textedge\", ax=ax)\n", + " camelot.plot(tables[0], kind=\"textedge\", ax=ax)\n", "else:\n", " print(f\"No table found for document {pdf_file}.\")" ] @@ -243,7 +228,7 @@ "source": [ "if tables is not None:\n", " fig, ax = init_figure_and_axis(f\"Growth steps for table in PDF\\n{pdf_file}\")\n", - " pypdf_table_extraction.plot(tables[0], kind=\"network_table_search\", ax=ax)\n", + " camelot.plot(tables[0], kind=\"network_table_search\", ax=ax)\n", "else:\n", " print(\"No table found for this document.\")" ] @@ -292,12 +277,12 @@ "# Parse file\n", "flavor = \"lattice\"\n", "timer_before_parse = time.perf_counter()\n", - "tables = pypdf_table_extraction.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n", + "tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n", "timer_after_parse = time.perf_counter()\n", "\n", "if tables is not None:\n", " fig, ax = init_figure_and_axis(f\"Line structure in PDF\\n{pdf_file}\")\n", - " pypdf_table_extraction.plot(tables[0], kind=\"line\", ax=ax)\n", + " camelot.plot(tables[0], kind=\"line\", ax=ax)\n", "else:\n", " print(\"No table found for this document.\")" ] @@ -319,7 +304,7 @@ "source": [ "for table in tables:\n", " fig, ax = init_figure_and_axis(f\"Contour structure in PDF\\n{pdf_file}\")\n", - " pypdf_table_extraction.plot(table, kind=\"contour\", ax=ax)" + " camelot.plot(table, kind=\"contour\", ax=ax)" ] }, { @@ -339,7 +324,7 @@ "source": [ "for table in tables:\n", " fig, ax = init_figure_and_axis(f\"Joint structure in PDF\\n{pdf_file}\")\n", - " pypdf_table_extraction.plot(table, kind=\"joint\", ax=ax)" + " camelot.plot(table, kind=\"joint\", ax=ax)" ] }, { @@ -392,7 +377,7 @@ "source": [ "flavor = \"hybrid\"\n", "timer_before_parse = time.perf_counter()\n", - "tables = pypdf_table_extraction.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n", + "tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n", "timer_after_parse = time.perf_counter()\n", "\n", "display_parse_results(tables, timer_after_parse - timer_before_parse, flavor)" @@ -416,7 +401,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.5" + "version": "3.13.0" }, "mimetype": "text/x-python", "name": "python", diff --git a/examples/parser-comparison-notebook.ipynb b/examples/parser-comparison-notebook.ipynb index f1269349..2f0f442d 100644 --- a/examples/parser-comparison-notebook.ipynb +++ b/examples/parser-comparison-notebook.ipynb @@ -13,7 +13,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Setup pypdf_table_extraction" + "# Install camelot" ] }, { @@ -22,20 +22,7 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", - "\n", - "os.getcwd()\n", - "# Install from source\n", - "!git clone -b main https://github.com/py-pdf/pypdf_table_extraction.git src\n", - "%cd src\n", - "\n", - "\n", - "!pip install -e .\n", - "\n", - "# Optionally you can Install ghostscript as the imageconversion backend.\n", - "# uncomment the following lines\n", - "# !apt-get install -y ghostscript\n", - "# !pip install ghostscript" + "!pip install camelot-py" ] }, { @@ -51,12 +38,10 @@ "\n", "sys.path.insert(\n", " 0, os.path.abspath(\"\")\n", - ") # Prefer the local version of pypdf_table_extraction if available\n", - "import pypdf_table_extraction\n", + ") # Prefer the local version of camelot if available\n", + "import camelot\n", "\n", - "print(\n", - " f\"Using pypdf_table_extraction v{pypdf_table_extraction.__version__} from file {pypdf_table_extraction.__file__}.\"\n", - ")" + "print(f\"Using camelot v{camelot.__version__}.\")" ] }, { @@ -65,7 +50,7 @@ "source": [ "## Select a PDF file to review\n", "\n", - "You can modify the section below to point to a pdf or your choice to visualize the results. By default, it points to one of the test .pdfs included with pypdf_table_extraction.\n", + "You can modify the section below to point to a pdf or your choice to visualize the results. By default, it points to one of the test .pdfs included with camelot.\n", "This is seeded with the unit test files for convenience." ] }, @@ -145,9 +130,7 @@ " timer_before_parse = time.perf_counter()\n", " error, tables = None, []\n", " try:\n", - " tables = pypdf_table_extraction.read_pdf(\n", - " filename, flavor=flavor, debug=True, **kwargs\n", - " )\n", + " tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n", " except ValueError as value_error:\n", " error = f\"Invalid argument for parser {flavor}: {value_error}\"\n", " print(error)\n", @@ -195,7 +178,7 @@ "# Set up plotting options\n", "import matplotlib.pyplot as plt\n", "\n", - "%matplotlib inline\n", + "# %matplotlib inline\n", "PLOT_HEIGHT = 12\n", "\n", "row_count = max(max_tables, 1)\n", @@ -260,7 +243,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.5" + "version": "3.13.0" }, "mimetype": "text/x-python", "name": "python", From 91e2fa76b6d247ec12f5341acd6c990645a2aacd Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Mon, 30 Dec 2024 11:36:01 +0100 Subject: [PATCH 2/3] Remove 3.8 tests --- .github/workflows/tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b2f1cabf..f34f281b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -25,7 +25,6 @@ jobs: - { python: "3.12", os: "ubuntu-latest", session: "tests" } - { python: "3.11", os: "ubuntu-latest", session: "tests" } - { python: "3.10", os: "ubuntu-latest", session: "tests" } - - { python: "3.8", os: "ubuntu-latest", session: "tests" } - { python: "3.9", os: "ubuntu-latest", session: "tests" } - { python: "3.10", os: "windows-latest", session: "tests" } - { python: "3.10", os: "macos-latest", session: "tests" } From bef80bf10c3de1dd5e9f115f144ea8ddd0127d39 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Mon, 30 Dec 2024 11:36:54 +0100 Subject: [PATCH 3/3] Update requires-python --- .github/workflows/tests.yml | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f34f281b..b2f1cabf 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -25,6 +25,7 @@ jobs: - { python: "3.12", os: "ubuntu-latest", session: "tests" } - { python: "3.11", os: "ubuntu-latest", session: "tests" } - { python: "3.10", os: "ubuntu-latest", session: "tests" } + - { python: "3.8", os: "ubuntu-latest", session: "tests" } - { python: "3.9", os: "ubuntu-latest", session: "tests" } - { python: "3.10", os: "windows-latest", session: "tests" } - { python: "3.10", os: "macos-latest", session: "tests" } diff --git a/pyproject.toml b/pyproject.toml index ff2d90f4..a2eee561 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ readme = "README.md" classifiers = [ "Development Status :: 5 - Production/Stable", ] -requires-python = ">=3.9" +requires-python = ">=3.8" dependencies = [ "click>=8.0.1", "chardet>=5.1.0",