From c998569c8f5c4e080509283123175769fc7568ae Mon Sep 17 00:00:00 2001
From: Leonid Ganeline
Date: Wed, 17 May 2023 21:33:34 -0700
Subject: [PATCH] docs: text splitters improvements (#4490)

#docs: text splitters improvements

Changes are only in the Jupyter notebooks.
- added links to the source packages and a short description of these packages
- removed the " Text Splitter" suffixes from the TOC elements (they made the list of the text splitters messy)
- moved the text splitters that are based on a tokenizer length function into a separate list; they can be mixed with any of the "Text Splitters" classes, so it is a different classification

## Who can review?

@hwchase17 - project lead
@eyurtsev
@vowelparrot

NOTE: please check out the results of the `Python code` text splitter example (text_splitters/examples/python.ipynb). It looks suboptimal.
---
 docs/modules/indexes/text_splitters.rst       | 43 ++++++++++-
 .../examples/character_text_splitter.ipynb    | 47 +++++++++--
 .../huggingface_length_function.ipynb         | 11 ++-
 .../text_splitters/examples/latex.ipynb       | 58 +++++++++++---
 .../text_splitters/examples/markdown.ipynb    | 77 +++++++++++++++----
 .../text_splitters/examples/nltk.ipynb        | 23 ++++--
 .../text_splitters/examples/python.ipynb      | 49 ++++++++++--
 .../examples/recursive_text_splitter.ipynb    | 55 +++++++++++--
 .../text_splitters/examples/spacy.ipynb       | 23 ++++--
 .../text_splitters/examples/tiktoken.ipynb    | 20 ++++-
 .../examples/tiktoken_splitter.ipynb          | 23 ++++--
 11 files changed, 356 insertions(+), 73 deletions(-)

diff --git a/docs/modules/indexes/text_splitters.rst b/docs/modules/indexes/text_splitters.rst
index 57faa37ef198a..9b8b66fb21b5a 100644
--- a/docs/modules/indexes/text_splitters.rst
+++ b/docs/modules/indexes/text_splitters.rst
@@ -30,12 +30,47 @@
 For an introduction to the default text splitter and generic functionality see:
 
    ./text_splitters/getting_started.ipynb
 
-We also have documentation for all the types of text splitters that are supported.
-Please see below for that list.
+Usage examples for the text splitters:
+
+- `Character <./text_splitters/examples/character_text_splitter.html>`_
+- `LaTeX <./text_splitters/examples/latex.html>`_
+- `Markdown <./text_splitters/examples/markdown.html>`_
+- `NLTK <./text_splitters/examples/nltk.html>`_
+- `Python code <./text_splitters/examples/python.html>`_
+- `Recursive Character <./text_splitters/examples/recursive_text_splitter.html>`_
+- `spaCy <./text_splitters/examples/spacy.html>`_
+- `tiktoken (OpenAI) <./text_splitters/examples/tiktoken_splitter.html>`_
 
 .. toctree::
    :maxdepth: 1
-   :glob:
+   :caption: Text Splitters
+   :name: text_splitters
+   :hidden:
+
+   ./text_splitters/examples/character_text_splitter.ipynb
+   ./text_splitters/examples/latex.ipynb
+   ./text_splitters/examples/markdown.ipynb
+   ./text_splitters/examples/nltk.ipynb
+   ./text_splitters/examples/python.ipynb
+   ./text_splitters/examples/recursive_text_splitter.ipynb
+   ./text_splitters/examples/spacy.ipynb
+   ./text_splitters/examples/tiktoken_splitter.ipynb
+
+
+Most LLMs are constrained by the number of tokens that you can pass in, which is not the same as the number of characters.
+In order to get a more accurate estimate, we can use tokenizers to count the number of tokens in the text.
+We use this number inside the text splitter classes.
+This is implemented as the `from_huggingface_tokenizer` and `from_tiktoken_encoder` class methods of those classes:
+
+- `Hugging Face tokenizer <./text_splitters/examples/huggingface_length_function.html>`_
+- `tiktoken (OpenAI) tokenizer <./text_splitters/examples/tiktoken.html>`_
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Text Splitters with Tokens
+   :name: text_splitter_with_tokens
+   :hidden:
 
-   ./text_splitters/examples/*
+   ./text_splitters/examples/huggingface_length_function.ipynb
+   ./text_splitters/examples/tiktoken.ipynb
\ No newline at end of file
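The `from_...` constructors mentioned above are worth a concrete illustration. A minimal sketch of the pattern, assuming `langchain` and `tiktoken` are installed (the chunk sizes are arbitrary):

```python
from langchain.text_splitter import CharacterTextSplitter

# The length function now counts tiktoken tokens instead of characters,
# so chunk_size and chunk_overlap below are expressed in tokens.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100, chunk_overlap=0
)

chunks = text_splitter.split_text("Some long document text ...")
print(len(chunks))
```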
diff --git a/docs/modules/indexes/text_splitters/examples/character_text_splitter.ipynb b/docs/modules/indexes/text_splitters/examples/character_text_splitter.ipynb
index 0f6ccf0149618..c1a0a913df2e6 100644
--- a/docs/modules/indexes/text_splitters/examples/character_text_splitter.ipynb
+++ b/docs/modules/indexes/text_splitters/examples/character_text_splitter.ipynb
@@ -5,19 +5,21 @@
    "id": "5c461b26",
    "metadata": {},
    "source": [
-    "# Character Text Splitter\n",
+    "# Character\n",
     "\n",
-    "This is a more simple method. This splits based on characters (by default \"\\n\\n\") and measure chunk length by number of characters.\n",
+    "This is the simplest method. It splits based on characters (by default \"\\n\\n\") and measures chunk length by number of characters.\n",
     "\n",
     "1. How the text is split: by single character\n",
-    "2. How the chunk size is measured: by length function passed in (defaults to number of characters)"
+    "2. How the chunk size is measured: by number of characters"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 1,
    "id": "9c21e679",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "# This is a long document we can split up.\n",
@@ -29,7 +31,9 @@
    "cell_type": "code",
    "execution_count": 2,
    "id": "79ff6737",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from langchain.text_splitter import CharacterTextSplitter\n",
@@ -87,6 +91,37 @@
    "documents = text_splitter.create_documents([state_of_the_union, state_of_the_union], metadatas=metadatas)\n",
    "print(documents[0])"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "90ac0381-855a-469a-b8bf-e33230132bbe",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\nWith a duty to one another to the American people to the Constitution. \\n\\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \\n\\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \\n\\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \\n\\nHe met the Ukrainian people. \\n\\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "text_splitter.split_text(state_of_the_union)[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "875c20be-9f63-4aee-b05a-34e9c04c1091",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -105,7 +140,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
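For quick reference outside the notebook, a minimal self-contained sketch of the splitter configured above (the sample text here is made up; the notebook reads `state_of_the_union.txt` instead, and the sizes are arbitrary):

```python
from langchain.text_splitter import CharacterTextSplitter

text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."

# Split on the default "\n\n" separator; sizes are counted in characters.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=30,
    chunk_overlap=0,
    length_function=len,
)

for chunk in text_splitter.split_text(text):
    print(repr(chunk))
```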
diff --git a/docs/modules/indexes/text_splitters/examples/huggingface_length_function.ipynb b/docs/modules/indexes/text_splitters/examples/huggingface_length_function.ipynb
index 32c0fb027d60d..bbea143c48987 100644
--- a/docs/modules/indexes/text_splitters/examples/huggingface_length_function.ipynb
+++ b/docs/modules/indexes/text_splitters/examples/huggingface_length_function.ipynb
@@ -5,11 +5,14 @@
    "id": "13dc0983",
    "metadata": {},
    "source": [
-    "# Hugging Face Length Function\n",
-    "Most LLMs are constrained by the number of tokens that you can pass in, which is not the same as the number of characters. In order to get a more accurate estimate, we can use Hugging Face tokenizers to count the text length.\n",
+    "# Hugging Face tokenizer\n",
+    "\n",
+    ">[Hugging Face](https://huggingface.co/docs/tokenizers/index) has many tokenizers.\n",
+    "\n",
+    "We use a Hugging Face tokenizer, the [GPT2TokenizerFast](https://huggingface.co/Ransaka/gpt2-tokenizer-fast), to count the text length in tokens.\n",
     "\n",
     "1. How the text is split: by character passed in\n",
-    "2. How the chunk size is measured: by Hugging Face tokenizer"
+    "2. How the chunk size is measured: by number of tokens calculated by the `Hugging Face` tokenizer\n"
    ]
   },
@@ -89,7 +92,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
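A minimal sketch of the token-counting pattern this notebook describes, assuming `transformers` is installed (the tokenizer files are downloaded on first use; the sizes are arbitrary):

```python
from transformers import GPT2TokenizerFast
from langchain.text_splitter import CharacterTextSplitter

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# chunk_size and chunk_overlap are measured in GPT-2 tokens here,
# not in characters.
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=100, chunk_overlap=0
)

texts = text_splitter.split_text("Some long document text ...")
```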
diff --git a/docs/modules/indexes/text_splitters/examples/latex.ipynb b/docs/modules/indexes/text_splitters/examples/latex.ipynb
index 596dd8c114941..aaf1f57526ec3 100644
--- a/docs/modules/indexes/text_splitters/examples/latex.ipynb
+++ b/docs/modules/indexes/text_splitters/examples/latex.ipynb
@@ -5,19 +5,23 @@
    "id": "3a2f572e",
    "metadata": {},
    "source": [
-    "# Latex Text Splitter\n",
+    "# LaTeX\n",
     "\n",
-    "LatexTextSplitter splits text along Latex headings, headlines, enumerations and more. It's implemented as a simple subclass of RecursiveCharacterSplitter with Latex-specific separators. See the source code to see the Latex syntax expected by default.\n",
-    "\n",
-    "1. How the text is split: by list of latex specific tags\n",
-    "2. How the chunk size is measured: by length function passed in (defaults to number of characters)"
+    ">[LaTeX](https://en.wikipedia.org/wiki/LaTeX) is widely used in academia for the communication and publication of scientific documents in many fields, including mathematics, computer science, engineering, physics, chemistry, economics, linguistics, quantitative psychology, philosophy, and political science.\n",
+    "\n",
+    "`LatexTextSplitter` splits text along `LaTeX` headings, headlines, enumerations and more. It's implemented as a subclass of `RecursiveCharacterTextSplitter` with LaTeX-specific separators. See the source code for more details.\n",
+    "\n",
+    "1. How the text is split: by list of `LaTeX`-specific tags\n",
+    "2. How the chunk size is measured: by number of characters"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "c2503917",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from langchain.text_splitter import LatexTextSplitter"
@@ -25,9 +29,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "e46b753b",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "latex_text = \"\"\"\n",
@@ -84,6 +90,40 @@
    "source": [
     "docs"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "40e62829-9485-414e-9ea1-e1a8fc7c88cb",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle',\n",
+       " 'Introduction}\\nLarge language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.',\n",
+       " 'History of LLMs}\\nThe earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.',\n",
+       " 'Applications of LLMs}\\nLLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.\\n\\n\\\\end{document}']"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "latex_splitter.split_text(latex_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7deb8f25-a062-4956-9f90-513802069667",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -102,7 +142,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
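One detail worth flagging in the output above: `\x08egin{document}` is not produced by the splitter. The notebook defines `latex_text` as a regular (non-raw) Python string, so the `\b` in `\begin` is interpreted as a backspace character (`\x08`) before the splitter ever sees it. A raw string avoids this; a minimal sketch (the LaTeX snippet is illustrative, as is the chunk size):

```python
from langchain.text_splitter import LatexTextSplitter

# A raw string keeps backslash sequences such as \begin intact.
latex_text = r"""
\documentclass{article}
\begin{document}
\maketitle
\section{Introduction}
Large language models (LLMs) can be trained on vast amounts of text data.
\end{document}
"""

latex_splitter = LatexTextSplitter(chunk_size=400, chunk_overlap=0)
print(latex_splitter.split_text(latex_text))
```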
diff --git a/docs/modules/indexes/text_splitters/examples/markdown.ipynb b/docs/modules/indexes/text_splitters/examples/markdown.ipynb
index adcc099f043fc..1c784e8bc7a0c 100644
--- a/docs/modules/indexes/text_splitters/examples/markdown.ipynb
+++ b/docs/modules/indexes/text_splitters/examples/markdown.ipynb
@@ -5,19 +5,23 @@
    "id": "80f6cd99",
    "metadata": {},
    "source": [
-    "# Markdown Text Splitter\n",
+    "# Markdown\n",
     "\n",
-    "MarkdownTextSplitter splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of RecursiveCharacterSplitter with Markdown-specific separators. See the source code to see the Markdown syntax expected by default.\n",
-    "\n",
-    "1. How the text is split: by list of markdown specific characters\n",
-    "2. How the chunk size is measured: by length function passed in (defaults to number of characters)"
+    ">[Markdown](https://en.wikipedia.org/wiki/Markdown) is a lightweight markup language for creating formatted text using a plain-text editor.\n",
+    "\n",
+    "`MarkdownTextSplitter` splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of `RecursiveCharacterTextSplitter` with Markdown-specific separators. See the source code to see the Markdown syntax expected by default.\n",
+    "\n",
+    "1. How the text is split: by list of Markdown-specific separators\n",
+    "2. How the chunk size is measured: by number of characters"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "id": "96d64839",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from langchain.text_splitter import MarkdownTextSplitter"
@@ -25,9 +29,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
    "id": "cfb0da17",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "markdown_text = \"\"\"\n",
@@ -49,9 +55,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
    "id": "d59a4fe8",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "docs = markdown_splitter.create_documents([markdown_text])"
@@ -59,19 +67,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "id": "cbb2e100",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[Document(page_content='# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡', lookup_str='', metadata={}, lookup_index=0),\n",
-       " Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", lookup_str='', metadata={}, lookup_index=0),\n",
-       " Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', lookup_str='', metadata={}, lookup_index=0)]"
+       "[Document(page_content='# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡', metadata={}),\n",
+       " Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", metadata={}),\n",
+       " Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', metadata={})]"
      ]
     },
-    "execution_count": 7,
+    "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
@@ -79,6 +89,39 @@
    "source": [
     "docs"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "91b56e7e-b285-4ca4-a786-149544e0e3c6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡',\n",
+       " \"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\",\n",
+       " 'As an open source project in a rapidly developing field, we are extremely open to contributions.']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "markdown_splitter.split_text(markdown_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9bee7858-9175-4d99-bd30-68f2dece8601",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -97,7 +140,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
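A compact, self-contained sketch of the same pattern (the sample text and chunk sizes are made up):

```python
from langchain.text_splitter import MarkdownTextSplitter

markdown_text = (
    "# Title\n\nSome intro text.\n\n"
    "## Section 1\n\nDetails about section one.\n\n"
    "## Section 2\n\nDetails about section two."
)

# Headings, code blocks, and horizontal rules act as preferred cut points.
markdown_splitter = MarkdownTextSplitter(chunk_size=60, chunk_overlap=0)
print(markdown_splitter.split_text(markdown_text))
```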
diff --git a/docs/modules/indexes/text_splitters/examples/nltk.ipynb b/docs/modules/indexes/text_splitters/examples/nltk.ipynb
index bdf084a650041..5285e5fc9f8cc 100644
--- a/docs/modules/indexes/text_splitters/examples/nltk.ipynb
+++ b/docs/modules/indexes/text_splitters/examples/nltk.ipynb
@@ -5,11 +5,24 @@
    "id": "ea2973ac",
    "metadata": {},
    "source": [
-    "# NLTK Text Splitter\n",
-    "Rather than just splitting on \"\\n\\n\", we can use NLTK to split based on tokenizers.\n",
+    "# NLTK\n",
+    "\n",
+    ">[The Natural Language Toolkit](https://en.wikipedia.org/wiki/Natural_Language_Toolkit), or more commonly [NLTK](https://www.nltk.org/), is a suite of libraries and programs for symbolic and statistical natural language processing (NLP) for English written in the Python programming language.\n",
+    "\n",
+    "Rather than just splitting on \"\\n\\n\", we can use `NLTK` to split based on [NLTK tokenizers](https://www.nltk.org/api/nltk.tokenize.html).\n",
     "\n",
-    "1. How the text is split: by NLTK\n",
-    "2. How the chunk size is measured: by length function passed in (defaults to number of characters)"
+    "1. How the text is split: by `NLTK` tokenizer\n",
+    "2. How the chunk size is measured: by number of characters"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b6af9886-7d53-4aab-84f6-303c4cce7882",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!pip install nltk"
+   ]
+  },
   {
@@ -103,7 +116,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
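A minimal end-to-end sketch (assuming `nltk` is installed; NLTK's sentence tokenizer additionally needs the `punkt` data package, downloaded once; the chunk size is arbitrary):

```python
import nltk
from langchain.text_splitter import NLTKTextSplitter

# One-time download of the sentence-tokenizer models used by NLTK.
nltk.download("punkt")

text_splitter = NLTKTextSplitter(chunk_size=1000)
texts = text_splitter.split_text("First sentence. Second sentence. Third sentence.")
print(texts)
```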
diff --git a/docs/modules/indexes/text_splitters/examples/python.ipynb b/docs/modules/indexes/text_splitters/examples/python.ipynb
index 7dddfc8f4902a..a184bcd5eb52a 100644
--- a/docs/modules/indexes/text_splitters/examples/python.ipynb
+++ b/docs/modules/indexes/text_splitters/examples/python.ipynb
@@ -5,19 +5,21 @@
    "id": "c350765d",
    "metadata": {},
    "source": [
-    "# Python Code Text Splitter\n",
+    "# Python Code\n",
     "\n",
-    "PythonCodeTextSplitter splits text along python class and method definitions. It's implemented as a simple subclass of RecursiveCharacterSplitter with Python-specific separators. See the source code to see the Python syntax expected by default.\n",
-    "\n",
-    "1. How the text is split: by list of python specific characters\n",
-    "2. How the chunk size is measured: by length function passed in (defaults to number of characters)"
+    "`PythonCodeTextSplitter` splits text along Python class and method definitions. It's implemented as a simple subclass of `RecursiveCharacterTextSplitter` with Python-specific separators. See the source code to see the Python syntax expected by default.\n",
+    "\n",
+    "1. How the text is split: by list of Python-specific separators\n",
+    "2. How the chunk size is measured: by number of characters"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 1,
    "id": "1703463f",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from langchain.text_splitter import PythonCodeTextSplitter"
@@ -27,7 +29,9 @@
   },
   {
    "cell_type": "code",
    "execution_count": 2,
    "id": "f17a1854",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "python_text = \"\"\"\n",
@@ -77,6 +81,37 @@
    "source": [
     "docs"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "de625e08-c440-489d-beed-020b6c53bf69",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Foo:\\n\\n    def bar():', 'foo():\\n\\ndef testing_func():', 'bar():']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "python_splitter.split_text(python_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55aadd84-75ca-48ae-9b84-b39c368488ed",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -95,7 +130,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
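About the suboptimal chunks flagged in the PR description: with a very small chunk size, nearly every `class `/`def ` separator becomes a cut point, and the separator text does not survive the cut, which is why the chunks above appear without their leading keywords (`'Foo:\n\n    def bar():'` and the like). A sketch suggesting a larger chunk size as a workaround (the sample code is reconstructed from the output above; the sizes are illustrative):

```python
from langchain.text_splitter import PythonCodeTextSplitter

python_text = """
class Foo:

    def bar():
        pass


def foo():
    pass


def testing_func():
    pass


def bar():
    pass
"""

# With more room per chunk, fewer cuts land on the class/def separators,
# so related definitions are more likely to stay together.
python_splitter = PythonCodeTextSplitter(chunk_size=120, chunk_overlap=0)
print(python_splitter.split_text(python_text))
```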
diff --git a/docs/modules/indexes/text_splitters/examples/recursive_text_splitter.ipynb b/docs/modules/indexes/text_splitters/examples/recursive_text_splitter.ipynb
index 3d26cfe01d06e..0d4eac5a6b54b 100644
--- a/docs/modules/indexes/text_splitters/examples/recursive_text_splitter.ipynb
+++ b/docs/modules/indexes/text_splitters/examples/recursive_text_splitter.ipynb
@@ -5,19 +5,22 @@
    "id": "072eee66",
    "metadata": {},
    "source": [
-    "# RecursiveCharacterTextSplitter\n",
+    "# Recursive Character\n",
+    "\n",
     "This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is `[\"\\n\\n\", \"\\n\", \" \", \"\"]`. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text.\n",
     "\n",
     "\n",
     "1. How the text is split: by list of characters\n",
-    "2. How the chunk size is measured: by length function passed in (defaults to number of characters)"
+    "2. How the chunk size is measured: by number of characters"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "id": "d887c134",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "# This is a long document we can split up.\n",
@@ -27,9 +30,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "id": "14662639",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from langchain.text_splitter import RecursiveCharacterTextSplitter"
@@ -39,7 +44,9 @@
   {
    "cell_type": "code",
    "execution_count": 4,
    "id": "fc6e42c8",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "text_splitter = RecursiveCharacterTextSplitter(\n",
@@ -70,6 +77,38 @@
    "print(texts[0])\n",
    "print(texts[1])"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "ca35212d-634c-4679-9042-19c294a3c815",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and',\n",
+       " 'of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "text_splitter.split_text(state_of_the_union)[:2]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b019a56a-7ba5-479d-b696-32188e4bc433",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -88,7 +127,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
diff --git a/docs/modules/indexes/text_splitters/examples/spacy.ipynb b/docs/modules/indexes/text_splitters/examples/spacy.ipynb
index ba442723defe8..6fa2578910945 100644
--- a/docs/modules/indexes/text_splitters/examples/spacy.ipynb
+++ b/docs/modules/indexes/text_splitters/examples/spacy.ipynb
@@ -5,11 +5,24 @@
    "id": "dab86b60",
    "metadata": {},
    "source": [
-    "# Spacy Text Splitter\n",
-    "Another alternative to NLTK is to use Spacy.\n",
+    "# spaCy\n",
+    "\n",
+    ">[spaCy](https://spacy.io/) is an open-source software library for advanced natural language processing, written in the programming languages Python and Cython.\n",
+    "\n",
+    "Another alternative to `NLTK` is to use the [spaCy tokenizer](https://spacy.io/api/tokenizer).\n",
     "\n",
-    "1. How the text is split: by Spacy\n",
-    "2. How the chunk size is measured: by length function passed in (defaults to number of characters)"
+    "1. How the text is split: by `spaCy` tokenizer\n",
+    "2. How the chunk size is measured: by number of characters"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d0b9242f-690c-4819-b35a-bb68187281ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!pip install spacy"
+   ]
+  },
   {
@@ -125,7 +138,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
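A minimal end-to-end sketch (assuming `spacy` is installed; `SpacyTextSplitter` defaults to the `en_core_web_sm` pipeline, which is fetched once with `python -m spacy download en_core_web_sm`; the chunk size is arbitrary):

```python
from langchain.text_splitter import SpacyTextSplitter

# Sentences detected by the spaCy pipeline become the split units;
# chunk_size is still measured in characters.
text_splitter = SpacyTextSplitter(chunk_size=1000)

texts = text_splitter.split_text("First sentence. Second sentence. Third sentence.")
print(texts)
```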
diff --git a/docs/modules/indexes/text_splitters/examples/tiktoken.ipynb b/docs/modules/indexes/text_splitters/examples/tiktoken.ipynb
index eb8cf2a6e28c8..531c8d3dbb2d1 100644
--- a/docs/modules/indexes/text_splitters/examples/tiktoken.ipynb
+++ b/docs/modules/indexes/text_splitters/examples/tiktoken.ipynb
@@ -5,13 +5,27 @@
    "id": "7683b36a",
    "metadata": {},
    "source": [
-    "# tiktoken (OpenAI) Length Function\n",
-    "You can also use tiktoken, an open source tokenizer package from OpenAI to estimate tokens used. Will probably be more accurate for their models.\n",
+    "# tiktoken (OpenAI) tokenizer\n",
+    "\n",
+    ">[tiktoken](https://github.com/openai/tiktoken) is a fast `BPE` tokenizer created by `OpenAI`.\n",
+    "\n",
+    "We can use it to estimate the number of tokens used. It will probably be more accurate for the OpenAI models.\n",
     "\n",
     "1. How the text is split: by character passed in\n",
     "2. How the chunk size is measured: by `tiktoken` tokenizer"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6c4ef83e-f43a-4658-ad1a-3952e0a5bbe7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!pip install tiktoken"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
@@ -77,7 +91,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
diff --git a/docs/modules/indexes/text_splitters/examples/tiktoken_splitter.ipynb b/docs/modules/indexes/text_splitters/examples/tiktoken_splitter.ipynb
index a3ad0a1af2fe2..2781ebc4b712e 100644
--- a/docs/modules/indexes/text_splitters/examples/tiktoken_splitter.ipynb
+++ b/docs/modules/indexes/text_splitters/examples/tiktoken_splitter.ipynb
@@ -5,12 +5,27 @@
    "id": "53049ff5",
    "metadata": {},
    "source": [
-    "# TiktokenText Splitter\n",
+    "# Tiktoken\n",
+    "\n",
+    ">[tiktoken](https://github.com/openai/tiktoken) is a fast `BPE` tokenizer created by `OpenAI`.\n",
+    "\n",
     "\n",
     "1. How the text is split: by `tiktoken` tokens\n",
     "2. How the chunk size is measured: by `tiktoken` tokens"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e6e8223b-7e93-4220-8b22-27aea5cf3f56",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "#!pip install tiktoken"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -47,9 +62,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 6,
    "id": "5750228a",
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -89,7 +102,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.6"
   },
   "vscode": {
    "interpreter": {
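The unchanged code cells of the last notebook are not shown in the diff; for reference, a minimal sketch of splitting directly on tiktoken tokens via LangChain's `TokenTextSplitter` (assuming `tiktoken` is installed; the chunk size is arbitrary):

```python
from langchain.text_splitter import TokenTextSplitter

# Both the cut points and the chunk size are expressed in tiktoken tokens.
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

texts = text_splitter.split_text("Some long document text ...")
print(texts[0])
```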