Update notebooks (#522)

camelot-dev · Dec 30, 2024 · fb334ca · fb334ca
2 parents 12687dc + bef80bf
commit fb334ca
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 75 deletions.
diff --git a/...ble_extraction_quick_start_notebook.ipynb → examples/camelot-quickstart-notebook.ipynb b/...ble_extraction_quick_start_notebook.ipynb → examples/camelot-quickstart-notebook.ipynb
@@ -8,9 +8,9 @@
    "source": [
     "# Quickstart example\n",
     "\n",
-    "This notebook shows you a quick way to get started with [pypdf_table_extraction](https://github.com/py-pdf/pypdf_table_extraction) .\n",
+    "This notebook shows you how to quickly get started with [camelot](https://github.com/camelot-dev/camelot).\n",
     "\n",
-    "**Usage:** Either upload files or provide a PDF URL in the specified cells."
+    "**Usage:** Either upload PDFs or add a URL to a PDF in the specified cells."
    ]
   },
   {
@@ -21,8 +21,8 @@
    },
    "outputs": [],
    "source": [
-    "# @title 🛠️ Install [pypdf_table_extraction](https://github.com/py-pdf/pypdf_table_extraction)\n",
-    "!pip install pypdf-table-extraction\n",
+    "# @title 🛠️ Install [camelot](https://github.com/camelot-dev/camelot)\n",
+    "!pip install camelot-py\n",
     "# install tabulate (optional) only needed in this notebook for pretty display of results.\n",
     "!pip install tabulate"
    ]
@@ -38,11 +38,9 @@
     "\n",
     "sys.path.insert(0, os.path.abspath(\"\"))  # Prefer the local version if available\n",
     "\n",
-    "import pypdf_table_extraction\n",
+    "import camelot\n",
     "\n",
-    "print(\n",
-    "    f\"Using pypdf_table_extraction v{pypdf_table_extraction.__version__} from file {pypdf_table_extraction.__file__}.\"\n",
-    ")"
+    "print(f\"Using camelot v{camelot.__version__}.\")"
    ]
   },
   {
@@ -154,6 +152,7 @@
     "\n",
     "# import os\n",
     "import requests\n",
+    "\n",
     "# from pathlib import Path\n",
     "\n",
     "\n",
@@ -168,7 +167,7 @@
     "\n",
     "\n",
     "# Sample .pdf data from GitHub\n",
-    "pdf_url = \"https://github.com/py-pdf/pypdf_table_extraction/blob/main/docs/_static/pdf/foo.pdf\"  # @param {type:\"string\"}\n",
+    "pdf_url = \"https://github.com/camelot-dev/camelot/blob/main/docs/_static/pdf/foo.pdf\"  # @param {type:\"string\"}\n",
     "\n",
     "# Convert the GitHub URL to the raw content URL\n",
     "pdf_url = convert_github_url_to_raw(pdf_url)\n",
@@ -206,7 +205,6 @@
    },
    "outputs": [],
    "source": [
-    "# import pypdf_table_extraction\n",
     "import logging\n",
     "import pandas as pd\n",
     "\n",
@@ -235,9 +233,9 @@
     "        logging.error(f\"Failed to open PDF {pdf_file.name} with PdfReader: {e}\")\n",
     "        return\n",
     "\n",
-    "    # Read tables from the PDF using pypdf_table_extraction\n",
+    "    # Read tables from the PDF using camelot\n",
     "    try:\n",
-    "        tables = pypdf_table_extraction.read_pdf(str(pdf_file))\n",
+    "        tables = camelot.read_pdf(str(pdf_file))\n",
     "    except Exception as e:\n",
     "        print(f\"Failed to read PDF {pdf_file.name}: {e}\")\n",
     "        logging.error(f\"Failed to read PDF {pdf_file.name}: {e}\")\n",
@@ -316,7 +314,6 @@
    "source": [
     "# @title ⚙️ Core - Complex Tables (Loose Parameters) with Clean Output\n",
     "\n",
-    "# import pypdf_table_extraction\n",
     "# import os\n",
     "# from pathlib import Path\n",
     "import pandas as pd\n",
@@ -333,11 +330,11 @@
     "    print(f\"\\nProcessing {pdf_file.name}\")\n",
     "\n",
     "    # Using 'network' parsing method with table_areas\n",
-    "    tables_network = pypdf_table_extraction.read_pdf(str(pdf_file), flavor=\"network\")\n",
+    "    tables_network = camelot.read_pdf(str(pdf_file), flavor=\"network\")\n",
     "\n",
     "    if len(tables_network) == 0:\n",
     "        # If no tables are detected, try using 'lattice' parser\n",
-    "        tables_lattice = pypdf_table_extraction.read_pdf(\n",
+    "        tables_lattice = camelot.read_pdf(\n",
     "            str(pdf_file), flavor=\"lattice\", table_areas=[\"50,750,500,50\"]\n",
     "        )\n",
     "\n",
@@ -395,6 +392,7 @@
     "# @title 🗑️ Clear Input & Output Directory\n",
     "\n",
     "import shutil\n",
+    "\n",
     "# from pathlib import Path\n",
     "# import os\n",
     "\n",
@@ -447,7 +445,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.5"
+   "version": "3.13.0"
   }
  },
  "nbformat": 4,

diff --git a/examples/hybrid-parser-step-by-step.ipynb b/examples/hybrid-parser-step-by-step.ipynb
@@ -16,7 +16,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Setup pypdf_table_extraction"
+    "# Install camelot"
    ]
   },
   {
@@ -25,20 +25,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
-    "\n",
-    "os.getcwd()\n",
-    "# Install from source\n",
-    "!git clone -b main https://github.com/py-pdf/pypdf_table_extraction.git src\n",
-    "%cd src\n",
-    "\n",
-    "\n",
-    "!pip install -e .\n",
-    "\n",
-    "# Optionally you can Install ghostscript as the imageconversion backend.\n",
-    "# uncomment the following lines\n",
-    "# !apt-get install -y ghostscript\n",
-    "# !pip install ghostscript"
+    "!pip install camelot-py"
    ]
   },
   {
@@ -47,7 +34,7 @@
    "source": [
     "# Select a PDF file to analyze\n",
     "\n",
-    "You can modify the section below to point to a pdf or your choice to visualize how the algorithm analyzes it.  By default, it points to one of the test .pdfs included with pypdf_table_extraction."
+    "You can modify the section below to point to a pdf of your choice to visualize how the algorithm analyzes it.  By default, it points to one of the test .pdfs included with camelot."
    ]
   },
   {
@@ -63,12 +50,10 @@
     "\n",
     "sys.path.insert(\n",
     "    0, os.path.abspath(\"\")\n",
-    ")  # Prefer the local version of pypdf_table_extraction if available\n",
-    "import pypdf_table_extraction\n",
+    ")  # Prefer the local version of camelot if available\n",
+    "import camelot\n",
     "\n",
-    "print(\n",
-    "    f\"Using pypdf_table_extraction v{pypdf_table_extraction.__version__} from file {pypdf_table_extraction.__file__}.\"\n",
-    ")\n",
+    "print(f\"Using camelot v{camelot.__version__}.\")\n",
     "\n",
     "# Select a pdf to analyze.\n",
     "kwargs = {}\n",
@@ -127,7 +112,7 @@
     "# Set up plotting options\n",
     "import matplotlib.pyplot as plt\n",
     "\n",
-    "%matplotlib inline\n",
+    "# %matplotlib inline\n",
     "PLOT_HEIGHT = 12\n",
     "\n",
     "\n",
@@ -182,12 +167,12 @@
     "# Parse file\n",
     "flavor = \"network\"\n",
     "timer_before_parse = time.perf_counter()\n",
-    "tables = pypdf_table_extraction.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
+    "tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
     "timer_after_parse = time.perf_counter()\n",
     "\n",
     "if tables is not None:\n",
     "    fig, ax = init_figure_and_axis(f\"Text elements in PDF\\n{pdf_file}\")\n",
-    "    pypdf_table_extraction.plot(tables[0], kind=\"text\", ax=ax)\n",
+    "    camelot.plot(tables[0], kind=\"text\", ax=ax)\n",
     "else:\n",
     "    print(\"No table found for this document.\")"
    ]
@@ -213,7 +198,7 @@
    "source": [
     "if tables is not None:\n",
     "    fig, ax = init_figure_and_axis(f\"Text edges in PDF\\n{pdf_file}\")\n",
-    "    pypdf_table_extraction.plot(tables[0], kind=\"textedge\", ax=ax)\n",
+    "    camelot.plot(tables[0], kind=\"textedge\", ax=ax)\n",
     "else:\n",
     "    print(f\"No table found for document {pdf_file}.\")"
    ]
@@ -243,7 +228,7 @@
    "source": [
     "if tables is not None:\n",
     "    fig, ax = init_figure_and_axis(f\"Growth steps for table in PDF\\n{pdf_file}\")\n",
-    "    pypdf_table_extraction.plot(tables[0], kind=\"network_table_search\", ax=ax)\n",
+    "    camelot.plot(tables[0], kind=\"network_table_search\", ax=ax)\n",
     "else:\n",
     "    print(\"No table found for this document.\")"
    ]
@@ -292,12 +277,12 @@
     "# Parse file\n",
     "flavor = \"lattice\"\n",
     "timer_before_parse = time.perf_counter()\n",
-    "tables = pypdf_table_extraction.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
+    "tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
     "timer_after_parse = time.perf_counter()\n",
     "\n",
     "if tables is not None:\n",
     "    fig, ax = init_figure_and_axis(f\"Line structure in PDF\\n{pdf_file}\")\n",
-    "    pypdf_table_extraction.plot(tables[0], kind=\"line\", ax=ax)\n",
+    "    camelot.plot(tables[0], kind=\"line\", ax=ax)\n",
     "else:\n",
     "    print(\"No table found for this document.\")"
    ]
@@ -319,7 +304,7 @@
    "source": [
     "for table in tables:\n",
     "    fig, ax = init_figure_and_axis(f\"Contour structure in PDF\\n{pdf_file}\")\n",
-    "    pypdf_table_extraction.plot(table, kind=\"contour\", ax=ax)"
+    "    camelot.plot(table, kind=\"contour\", ax=ax)"
    ]
   },
   {
@@ -339,7 +324,7 @@
    "source": [
     "for table in tables:\n",
     "    fig, ax = init_figure_and_axis(f\"Joint structure in PDF\\n{pdf_file}\")\n",
-    "    pypdf_table_extraction.plot(table, kind=\"joint\", ax=ax)"
+    "    camelot.plot(table, kind=\"joint\", ax=ax)"
    ]
   },
   {
@@ -392,7 +377,7 @@
    "source": [
     "flavor = \"hybrid\"\n",
     "timer_before_parse = time.perf_counter()\n",
-    "tables = pypdf_table_extraction.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
+    "tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
     "timer_after_parse = time.perf_counter()\n",
     "\n",
     "display_parse_results(tables, timer_after_parse - timer_before_parse, flavor)"
@@ -416,7 +401,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.5"
+   "version": "3.13.0"
   },
   "mimetype": "text/x-python",
   "name": "python",

diff --git a/examples/parser-comparison-notebook.ipynb b/examples/parser-comparison-notebook.ipynb
@@ -13,7 +13,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Setup pypdf_table_extraction"
+    "# Install camelot"
    ]
   },
   {
@@ -22,20 +22,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
-    "\n",
-    "os.getcwd()\n",
-    "# Install from source\n",
-    "!git clone -b main https://github.com/py-pdf/pypdf_table_extraction.git src\n",
-    "%cd src\n",
-    "\n",
-    "\n",
-    "!pip install -e .\n",
-    "\n",
-    "# Optionally you can Install ghostscript as the imageconversion backend.\n",
-    "# uncomment the following lines\n",
-    "# !apt-get install -y ghostscript\n",
-    "# !pip install ghostscript"
+    "!pip install camelot-py"
    ]
   },
   {
@@ -51,12 +38,10 @@
     "\n",
     "sys.path.insert(\n",
     "    0, os.path.abspath(\"\")\n",
-    ")  # Prefer the local version of pypdf_table_extraction if available\n",
-    "import pypdf_table_extraction\n",
+    ")  # Prefer the local version of camelot if available\n",
+    "import camelot\n",
     "\n",
-    "print(\n",
-    "    f\"Using pypdf_table_extraction v{pypdf_table_extraction.__version__} from file {pypdf_table_extraction.__file__}.\"\n",
-    ")"
+    "print(f\"Using camelot v{camelot.__version__}.\")"
    ]
   },
   {
@@ -65,7 +50,7 @@
    "source": [
     "## Select a PDF file to review\n",
     "\n",
-    "You can modify the section below to point to a pdf or your choice to visualize the results.  By default, it points to one of the test .pdfs included with pypdf_table_extraction.\n",
+    "You can modify the section below to point to a pdf of your choice to visualize the results.  By default, it points to one of the test .pdfs included with camelot.\n",
     "This is seeded with the unit test files for convenience."
    ]
   },
@@ -145,9 +130,7 @@
     "    timer_before_parse = time.perf_counter()\n",
     "    error, tables = None, []\n",
     "    try:\n",
-    "        tables = pypdf_table_extraction.read_pdf(\n",
-    "            filename, flavor=flavor, debug=True, **kwargs\n",
-    "        )\n",
+    "        tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
     "    except ValueError as value_error:\n",
     "        error = f\"Invalid argument for parser {flavor}: {value_error}\"\n",
     "        print(error)\n",
@@ -195,7 +178,7 @@
     "# Set up plotting options\n",
     "import matplotlib.pyplot as plt\n",
     "\n",
-    "%matplotlib inline\n",
+    "# %matplotlib inline\n",
     "PLOT_HEIGHT = 12\n",
     "\n",
     "row_count = max(max_tables, 1)\n",
@@ -260,7 +243,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.5"
+   "version": "3.13.0"
   },
   "mimetype": "text/x-python",
   "name": "python",

diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,7 @@ readme = "README.md"
 classifiers = [
     "Development Status :: 5 - Production/Stable",
 ]
-requires-python = ">=3.9"
+requires-python = ">=3.8"
 dependencies = [
     "click>=8.0.1",
     "chardet>=5.1.0",