From 286f2f7964db061f455641490d429a7a0dcf4622 Mon Sep 17 00:00:00 2001
From: Jerry Liu
Date: Thu, 26 Oct 2023 08:51:21 -0700
Subject: [PATCH] Add longllmlingua (#8485)

---
 .../node_postprocessor/LongLLMLingua.ipynb    | 4445 +++++++++++++++++
 llama_index/indices/postprocessor/__init__.py |    2 +
 .../indices/postprocessor/longllmlingua.py    |  112 +
 3 files changed, 4559 insertions(+)
 create mode 100644 docs/examples/node_postprocessor/LongLLMLingua.ipynb
 create mode 100644 llama_index/indices/postprocessor/longllmlingua.py

diff --git a/docs/examples/node_postprocessor/LongLLMLingua.ipynb b/docs/examples/node_postprocessor/LongLLMLingua.ipynb
new file mode 100644
index 0000000000000..e325a32b323c0
--- /dev/null
+++ b/docs/examples/node_postprocessor/LongLLMLingua.ipynb
@@ -0,0 +1,4445 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# LongLLMLingua\n",
+    "\n",
+    "\"Open\n",
+    "\n",
+    "LongLLMLingua is a research project/paper that presents a new method for prompt compression in the long-context setting.\n",
+    "\n",
+    "- Paper: https://arxiv.org/abs/2310.06839\n",
+    "- Repo: https://github.com/microsoft/LLMLingua\n",
+    "\n",
+    "In this guide, we show how you can seamlessly use prompt compression in your RAG pipeline. We implement LongLLMLingua as a node postprocessor, which will compress context after the retrieval step before feeding it into the LLM.\n",
+    "\n",
+    "**NOTE**: we don't implement the [subsequence recovery method](https://github.com/microsoft/LLMLingua/blob/main/DOCUMENT.md#post-precessing) since that is after the step of processing the nodes.\n",
+    "\n",
+    "**NOTE**: You need quite a bit of RAM/GPU capacity to run this. We got it working on Colab Pro with a V100 instance."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install llmlingua llama-index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "openai.api_key = \"\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup (Data + Index)\n",
+    "\n",
+    "We load in PG's essay, index it, and define a retriever."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!wget \"https://www.dropbox.com/s/f6bmb19xdg0xedm/paul_graham_essay.txt?dl=1\" -O paul_graham_essay.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index import (\n",
+    "    VectorStoreIndex,\n",
+    "    SimpleDirectoryReader,\n",
+    "    load_index_from_storage,\n",
+    "    StorageContext,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load documents\n",
+    "documents = SimpleDirectoryReader(\n",
+    "    input_files=[\"paul_graham_essay.txt\"]\n",
+    ").load_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index = VectorStoreIndex.from_documents(documents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "retriever = index.as_retriever(similarity_top_k=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# query_str = \"What did the author do growing up?\"\n",
+    "# query_str = \"What did the author do during his time in YC?\"\n",
+    "query_str = \"Where did the author go for art school?\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = retriever.retrieve(query_str)\n",
+    "print(results)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup LongLLMLingua as a Postprocessor\n",
+    "\n",
+    "We set up `LongLLMLinguaPostprocessor`, which uses the `llmlingua` package to run prompt compression.\n",
+    "\n",
+    "We specify a target token size of 300, and supply an instruction string.\n",
+    "\n",
+    "Special thanks to Huiqiang J. for the help with the parameters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "57f58855b3244325963cd7c9d7c0aa6c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards: 0%| | 0/2 [00:00
+    def class_name(cls) -> str:
+        return "LongLLMLinguaPostprocessor"
+
+    def postprocess_nodes(
+        self,
+        nodes: List[NodeWithScore],
+        query_bundle: Optional[QueryBundle] = None,
+    ) -> List[NodeWithScore]:
+        """Optimize a node text given the query by shortening the node text."""
+        if query_bundle is None:
+            raise ValueError("Query bundle is required.")
+        context_texts = [n.get_content(metadata_mode=self.metadata_mode) for n in nodes]
+        # split by "\n\n" (recommended by LongLLMLingua authors)
+        new_context_texts = [
+            c for context in context_texts for c in context.split("\n\n")
+        ]
+
+        # You can use it this way, although the question-aware fine-grained compression hasn't been enabled.
+        compressed_prompt = self._llm_lingua.compress_prompt(
+            new_context_texts,  # ! Replace the previous context_list
+            instruction=self.instruction_str,
+            question=query_bundle.query_str,
+            # target_token=2000,
+            target_token=self.target_token,
+            rank_method=self.rank_method,
+            **self.additional_compress_kwargs,
+        )
+
+        compressed_prompt_txt = compressed_prompt["compressed_prompt"]
+
+        # separate out the question and instruction (appended to top and bottom)
+        compressed_prompt_txt_list = compressed_prompt_txt.split("\n\n")
+        compressed_prompt_txt_list = compressed_prompt_txt_list[1:-1]
+
+        # return nodes for each list
+        return [
+            NodeWithScore(node=TextNode(text=t)) for t in compressed_prompt_txt_list
+        ]
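
For reference, a minimal usage sketch of the `LongLLMLinguaPostprocessor` this patch adds, wired into the same retriever the notebook builds. This is a sketch rather than part of the patch: the import path assumes the two-line `__init__.py` change exports the class, the query-engine wiring uses the standard `RetrieverQueryEngine.from_args(..., node_postprocessors=...)` hook (not shown in the excerpt), and the constructor arguments mirror the attributes referenced in `postprocess_nodes()` plus the notebook prose about "a target token size of 300" and "an instruction string"; the full `__init__` signature is truncated above, so treat parameter names and values as assumptions.

from llama_index import SimpleDirectoryReader, VectorStoreIndex
from llama_index.indices.postprocessor import LongLLMLinguaPostprocessor
from llama_index.query_engine import RetrieverQueryEngine

# Build the same index/retriever as in the notebook above.
documents = SimpleDirectoryReader(input_files=["paul_graham_essay.txt"]).load_data()
index = VectorStoreIndex.from_documents(documents)
retriever = index.as_retriever(similarity_top_k=2)

# Compress retrieved context to roughly 300 tokens before it reaches the LLM.
# Constructor arguments are assumptions based on the attributes used in
# postprocess_nodes(); the actual __init__ is truncated in the excerpt above.
node_postprocessor = LongLLMLinguaPostprocessor(
    instruction_str="Given the context, please answer the final question",
    target_token=300,
    rank_method="longllmlingua",
)

# retrieval -> LongLLMLingua compression -> response synthesis
query_engine = RetrieverQueryEngine.from_args(
    retriever, node_postprocessors=[node_postprocessor]
)
response = query_engine.query("Where did the author go for art school?")
print(str(response))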