From 626311bf7301c33ee49faea24af80c1962270d77 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 1 Nov 2024 09:38:37 +0100 Subject: [PATCH] docs: add advanced chunking example Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docs/examples/advanced_chunking.ipynb | 211 ++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 docs/examples/advanced_chunking.ipynb diff --git a/docs/examples/advanced_chunking.ipynb b/docs/examples/advanced_chunking.ipynb new file mode 100644 index 00000000..809d3e9c --- /dev/null +++ b/docs/examples/advanced_chunking.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Advanced Chunking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we demonstrate an advanced chunking example, showcasing how a user can:\n", + "- serialize and include some parts of the metadata (as per application logic) into the final chunk text, and\n", + "- leverage a tokenizer to build specialized chunking logic, e.g. to impose a maximum token length and further split chunks beyond that." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We first convert an example document:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from docling.document_converter import DocumentConverter\n", + "\n", + "source = \"https://arxiv.org/pdf/2408.09869\"\n", + "converter = DocumentConverter()\n", + "doc = converter.convert(source=source).document" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below we define the metadata serialization logic and the specific usage of the tokenizer for applying the token limits.\n", + "\n", + "The whole process is wrapped as a `BaseChunker` implementation internally using a `HierarchicalChunker` and applying the logic on top of the results of the latter." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from copy import deepcopy\n", + "from typing import Iterable, Iterator\n", + "\n", + "from docling_core.transforms.chunker import (\n", + " BaseChunk,\n", + " BaseChunker,\n", + " DocMeta,\n", + " HierarchicalChunker,\n", + ")\n", + "from docling_core.types.doc import DoclingDocument as DLDocument\n", + "from pydantic import ConfigDict, PositiveInt\n", + "from transformers import AutoTokenizer\n", + "\n", + "\n", + "class MaxTokenLimitingChunker(BaseChunker):\n", + " model_config = ConfigDict(arbitrary_types_allowed=True)\n", + "\n", + " inner_chunker: BaseChunker = HierarchicalChunker()\n", + " tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-small-en-v1.5\")\n", + " max_tokens: PositiveInt = 512\n", + " delim: str = \"\\n\"\n", + "\n", + " def _serialize_meta_to_include(self, meta: DocMeta) -> str:\n", + " meta_parts = []\n", + " headings_part = self.delim.join(meta.headings or [])\n", + " if headings_part:\n", + " meta_parts.append(headings_part)\n", + " captions_part = self.delim.join(meta.captions or [])\n", + " if captions_part:\n", 
+ " meta_parts.append(captions_part)\n", + " return self.delim.join(meta_parts)\n", + "\n", + " def _split_above_max_tokens(self, chunk_iter: Iterable[BaseChunk]):\n", + " for chunk in chunk_iter:\n", + " meta = DocMeta.model_validate(chunk.meta)\n", + " meta_text = self._serialize_meta_to_include(meta=meta)\n", + " meta_list = [meta_text] if meta_text else []\n", + " full_ser = self.delim.join(meta_list + ([chunk.text] if chunk.text else []))\n", + "\n", + " meta_tokens = self.tokenizer(\n", + " meta_text, return_offsets_mapping=True, add_special_tokens=False\n", + " )[\"offset_mapping\"]\n", + " delim_tokens = (\n", + " self.tokenizer(\n", + " self.delim, return_offsets_mapping=True, add_special_tokens=False\n", + " )[\"offset_mapping\"]\n", + " if meta_text\n", + " else []\n", + " )\n", + " num_tokens_avail_for_text = self.max_tokens - (\n", + " len(meta_tokens) + len(delim_tokens)\n", + " )\n", + "\n", + " text_tokens = self.tokenizer(\n", + " chunk.text, return_offsets_mapping=True, add_special_tokens=False\n", + " )[\"offset_mapping\"]\n", + " num_text_tokens = len(text_tokens)\n", + "\n", + " if (\n", + " num_text_tokens <= num_tokens_avail_for_text\n", + " ): # chunk already within token limit\n", + " c = deepcopy(chunk)\n", + " c.text = full_ser\n", + " yield c\n", + " else: # chunk requires further splitting to meet token limit\n", + " fitting_texts = [\n", + " chunk.text[\n", + " text_tokens[base][0] : text_tokens[\n", + " min(base + num_tokens_avail_for_text, num_text_tokens) - 1\n", + " ][1]\n", + " ]\n", + " for base in range(0, num_text_tokens, num_tokens_avail_for_text)\n", + " ]\n", + " for text in fitting_texts:\n", + " c = deepcopy(chunk)\n", + " c.text = self.delim.join(meta_list + [text])\n", + " yield c\n", + "\n", + " def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:\n", + " chunk_iter = self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs)\n", + " yield from self._split_above_max_tokens(chunk_iter=chunk_iter)" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "In the example invocation shown below, one can see how a single original chunk (`self_ref == \"#/texts/8\"`) is split into multiple ones:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'len=64 text=1 Introduction\\nConverting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation ('" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'len=64 text=1 Introduction\\nRAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. 
As of today'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'len=26 text=1 Introduction\\n, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "chunker = MaxTokenLimitingChunker(max_tokens=64)\n", + "chunk_iter = chunker.chunk(dl_doc=doc)\n", + "\n", + "for chunk in chunk_iter:\n", + " meta = DocMeta.model_validate(chunk.meta)\n", + " if meta.doc_items[0].self_ref == \"#/texts/8\":\n", + " display(\n", + " f\"len={len(chunker.tokenizer(chunk.text, return_offsets_mapping=True, add_special_tokens=False)['offset_mapping'])} text={chunk.text}\"\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}