From bae14c8f13c4ff66b0d266ad1ea44ca97413258f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 7 Mar 2024 08:50:24 -0800 Subject: [PATCH 01/17] Right-truncate long chat completion prompts instead of left-truncating Instructions are usually at the beginning of the prompt. --- modules/chat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 237e827596..a1fcb6b0ca 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -197,16 +197,16 @@ def make_prompt(messages): while right - left > 1: mid = (left + right) // 2 - messages[-1]['content'] = user_message[mid:] + messages[-1]['content'] = user_message[:mid] prompt = make_prompt(messages) encoded_length = get_encoded_length(prompt) if encoded_length <= max_length: - right = mid - else: left = mid + else: + right = mid - messages[-1]['content'] = user_message[right:] + messages[-1]['content'] = user_message[:left] prompt = make_prompt(messages) encoded_length = get_encoded_length(prompt) if encoded_length > max_length: From 2681f6f64049c48f43d3371072fe7ee0e101f6dd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 7 Mar 2024 15:03:18 -0300 Subject: [PATCH 02/17] Make superbooga & superboogav2 functional again (#5656) --- extensions/superbooga/chromadb.py | 52 ++----- extensions/superbooga/requirements.txt | 2 +- extensions/superboogav2/api.py | 33 ++--- extensions/superboogav2/benchmark.py | 14 +- extensions/superboogav2/chat_handler.py | 31 ++-- extensions/superboogav2/chromadb.py | 142 ++++++++----------- extensions/superboogav2/data_preprocessor.py | 40 +++--- extensions/superboogav2/data_processor.py | 25 ++-- extensions/superboogav2/download_urls.py | 5 +- extensions/superboogav2/notebook_handler.py | 7 +- extensions/superboogav2/optimize.py | 26 ++-- extensions/superboogav2/parameters.py | 12 +- extensions/superboogav2/requirements.txt | 4 +- extensions/superboogav2/script.py | 42 +++--- extensions/superboogav2/utils.py | 3 +- 15 files changed, 185 insertions(+), 253 deletions(-) diff --git a/extensions/superbooga/chromadb.py b/extensions/superbooga/chromadb.py index 1fb7a71848..b16158e10b 100644 --- a/extensions/superbooga/chromadb.py +++ b/extensions/superbooga/chromadb.py @@ -1,43 +1,24 @@ +import random + import chromadb import posthog -import torch from chromadb.config import Settings -from sentence_transformers import SentenceTransformer - -from modules.logging_colors import logger +from chromadb.utils import embedding_functions -logger.info('Intercepting all calls to posthog :)') +# Intercept calls to posthog posthog.capture = lambda *args, **kwargs: None -class Collecter(): - def __init__(self): - pass - - def add(self, texts: list[str]): - pass - - def get(self, search_strings: list[str], n_results: int) -> list[str]: - pass +embedder = embedding_functions.SentenceTransformerEmbeddingFunction("sentence-transformers/all-mpnet-base-v2") - def clear(self): - pass - -class Embedder(): +class ChromaCollector(): def __init__(self): - pass - - def embed(self, text: str) -> list[torch.Tensor]: - pass + name = ''.join(random.choice('ab') for _ in range(10)) - -class ChromaCollector(Collecter): - def __init__(self, embedder: Embedder): - super().__init__() + self.name = name self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False)) - self.embedder = embedder - self.collection = self.chroma_client.create_collection(name="context", embedding_function=embedder.embed) + 
self.collection = self.chroma_client.create_collection(name=name, embedding_function=embedder) self.ids = [] def add(self, texts: list[str]): @@ -102,24 +83,15 @@ def get_ids_sorted(self, search_strings: list[str], n_results: int, n_initial: i return sorted(ids) def clear(self): - self.collection.delete(ids=self.ids) self.ids = [] - - -class SentenceTransformerEmbedder(Embedder): - def __init__(self) -> None: - self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2") - self.embed = self.model.encode + self.chroma_client.delete_collection(name=self.name) + self.collection = self.chroma_client.create_collection(name=self.name, embedding_function=embedder) def make_collector(): - global embedder - return ChromaCollector(embedder) + return ChromaCollector() def add_chunks_to_collector(chunks, collector): collector.clear() collector.add(chunks) - - -embedder = SentenceTransformerEmbedder() diff --git a/extensions/superbooga/requirements.txt b/extensions/superbooga/requirements.txt index 73a60078cf..4b16656875 100644 --- a/extensions/superbooga/requirements.txt +++ b/extensions/superbooga/requirements.txt @@ -1,5 +1,5 @@ beautifulsoup4==4.12.2 -chromadb==0.3.18 +chromadb==0.4.24 pandas==2.0.3 posthog==2.4.2 sentence_transformers==2.2.2 diff --git a/extensions/superboogav2/api.py b/extensions/superboogav2/api.py index 993e2b7d6d..552c1c2cfa 100644 --- a/extensions/superboogav2/api.py +++ b/extensions/superboogav2/api.py @@ -12,17 +12,16 @@ import json from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer -from urllib.parse import urlparse, parse_qs from threading import Thread +from urllib.parse import parse_qs, urlparse +import extensions.superboogav2.parameters as parameters from modules import shared from modules.logging_colors import logger from .chromadb import ChromaCollector from .data_processor import process_and_add_to_collector -import extensions.superboogav2.parameters as parameters - class CustomThreadingHTTPServer(ThreadingHTTPServer): def __init__(self, server_address, RequestHandlerClass, collector: ChromaCollector, bind_and_activate=True): @@ -38,7 +37,6 @@ def __init__(self, request, client_address, server, collector: ChromaCollector): self.collector = collector super().__init__(request, client_address, server) - def _send_412_error(self, message): self.send_response(412) self.send_header("Content-type", "application/json") @@ -46,7 +44,6 @@ def _send_412_error(self, message): response = json.dumps({"error": message}) self.wfile.write(response.encode('utf-8')) - def _send_404_error(self): self.send_response(404) self.send_header("Content-type", "application/json") @@ -54,14 +51,12 @@ def _send_404_error(self): response = json.dumps({"error": "Resource not found"}) self.wfile.write(response.encode('utf-8')) - def _send_400_error(self, error_message: str): self.send_response(400) self.send_header("Content-type", "application/json") self.end_headers() response = json.dumps({"error": error_message}) self.wfile.write(response.encode('utf-8')) - def _send_200_response(self, message: str): self.send_response(200) @@ -75,24 +70,21 @@ def _send_200_response(self, message: str): self.wfile.write(response.encode('utf-8')) - def _handle_get(self, search_strings: list[str], n_results: int, max_token_count: int, sort_param: str): if sort_param == parameters.SORT_DISTANCE: results = self.collector.get_sorted_by_dist(search_strings, n_results, max_token_count) elif sort_param == parameters.SORT_ID: results = self.collector.get_sorted_by_id(search_strings, 
n_results, max_token_count) - else: # Default is dist + else: # Default is dist results = self.collector.get_sorted_by_dist(search_strings, n_results, max_token_count) - + return { "results": results } - def do_GET(self): self._send_404_error() - def do_POST(self): try: content_length = int(self.headers['Content-Length']) @@ -107,7 +99,7 @@ def do_POST(self): if corpus is None: self._send_412_error("Missing parameter 'corpus'") return - + clear_before_adding = body.get('clear_before_adding', False) metadata = body.get('metadata') process_and_add_to_collector(corpus, self.collector, clear_before_adding, metadata) @@ -118,7 +110,7 @@ def do_POST(self): if corpus is None: self._send_412_error("Missing parameter 'metadata'") return - + self.collector.delete(ids_to_delete=None, where=metadata) self._send_200_response("Data successfully deleted") @@ -127,15 +119,15 @@ def do_POST(self): if search_strings is None: self._send_412_error("Missing parameter 'search_strings'") return - + n_results = body.get('n_results') if n_results is None: n_results = parameters.get_chunk_count() - + max_token_count = body.get('max_token_count') if max_token_count is None: max_token_count = parameters.get_max_token_count() - + sort_param = query_params.get('sort', ['distance'])[0] results = self._handle_get(search_strings, n_results, max_token_count, sort_param) @@ -146,7 +138,6 @@ def do_POST(self): except Exception as e: self._send_400_error(str(e)) - def do_DELETE(self): try: parsed_path = urlparse(self.path) @@ -161,12 +152,10 @@ def do_DELETE(self): except Exception as e: self._send_400_error(str(e)) - def do_OPTIONS(self): self.send_response(200) self.end_headers() - def end_headers(self): self.send_header('Access-Control-Allow-Origin', '*') self.send_header('Access-Control-Allow-Methods', '*') @@ -197,11 +186,11 @@ def start_server(self, port: int): def stop_server(self): if self.server is not None: - logger.info(f'Stopping chromaDB API.') + logger.info('Stopping chromaDB API.') self.server.shutdown() self.server.server_close() self.server = None self.is_running = False def is_server_running(self): - return self.is_running \ No newline at end of file + return self.is_running diff --git a/extensions/superboogav2/benchmark.py b/extensions/superboogav2/benchmark.py index 46475a088b..5d9331a776 100644 --- a/extensions/superboogav2/benchmark.py +++ b/extensions/superboogav2/benchmark.py @@ -9,23 +9,23 @@ import datetime import json import os - from pathlib import Path -from .data_processor import process_and_add_to_collector, preprocess_text +from .data_processor import preprocess_text, process_and_add_to_collector from .parameters import get_chunk_count, get_max_token_count from .utils import create_metadata_source + def benchmark(config_path, collector): # Get the current system date sysdate = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"benchmark_{sysdate}.txt" - + # Open the log file in append mode with open(filename, 'a') as log: with open(config_path, 'r') as f: data = json.load(f) - + total_points = 0 max_points = 0 @@ -45,7 +45,7 @@ def benchmark(config_path, collector): for question_group in item["questions"]: question_variants = question_group["question_variants"] criteria = question_group["criteria"] - + for q in question_variants: max_points += len(criteria) processed_text = preprocess_text(q) @@ -54,7 +54,7 @@ def benchmark(config_path, collector): results = collector.get_sorted_by_dist(processed_text, n_results=get_chunk_count(), max_token_count=get_max_token_count()) points = 0 - 
+ for c in criteria: for p in results: if c in p: @@ -69,4 +69,4 @@ def benchmark(config_path, collector): print(f'##Total points:\n\n{total_points}/{max_points}', file=log) - return total_points, max_points \ No newline at end of file + return total_points, max_points diff --git a/extensions/superboogav2/chat_handler.py b/extensions/superboogav2/chat_handler.py index 419b926451..01ff58947f 100644 --- a/extensions/superboogav2/chat_handler.py +++ b/extensions/superboogav2/chat_handler.py @@ -4,16 +4,17 @@ import re import extensions.superboogav2.parameters as parameters - +from extensions.superboogav2.utils import ( + create_context_text, + create_metadata_source +) from modules import chat, shared -from modules.text_generation import get_encoded_length -from modules.logging_colors import logger from modules.chat import load_character_memoized -from extensions.superboogav2.utils import create_context_text, create_metadata_source +from modules.logging_colors import logger +from modules.text_generation import get_encoded_length -from .data_processor import process_and_add_to_collector from .chromadb import ChromaCollector - +from .data_processor import process_and_add_to_collector CHAT_METADATA = create_metadata_source('automatic-chat-insert') @@ -21,17 +22,17 @@ def _remove_tag_if_necessary(user_input: str): if not parameters.get_is_manual(): return user_input - + return re.sub(r'^\s*!c\s*|\s*!c\s*$', '', user_input) def _should_query(input: str): if not parameters.get_is_manual(): return True - + if re.search(r'^\s*!c|!c\s*$', input, re.MULTILINE): return True - + return False @@ -69,7 +70,7 @@ def _concatinate_history(history: dict, state: dict): if len(exchange) >= 2: full_history_text += _format_single_exchange(bot_name, exchange[1]) - return full_history_text[:-1] # Remove the last new line. + return full_history_text[:-1] # Remove the last new line. def _hijack_last(context_text: str, history: dict, max_len: int, state: dict): @@ -82,20 +83,20 @@ def _hijack_last(context_text: str, history: dict, max_len: int, state: dict): for i, messages in enumerate(reversed(history['internal'])): for j, message in enumerate(reversed(messages)): num_message_tokens = get_encoded_length(_format_single_exchange(names[j], message)) - + # TODO: This is an extremely naive solution. A more robust implementation must be made. 
if history_tokens + num_context_tokens <= max_len: # This message can be replaced replace_position = (i, j) - + history_tokens += num_message_tokens - + if replace_position is None: logger.warn("The provided context_text is too long to replace any message in the history.") else: # replace the message at replace_position with context_text i, j = replace_position - history['internal'][-i-1][-j-1] = context_text + history['internal'][-i - 1][-j - 1] = context_text def custom_generate_chat_prompt_internal(user_input: str, state: dict, collector: ChromaCollector, **kwargs): @@ -120,5 +121,5 @@ def custom_generate_chat_prompt_internal(user_input: str, state: dict, collector user_input = create_context_text(results) + user_input elif parameters.get_injection_strategy() == parameters.HIJACK_LAST_IN_CONTEXT: _hijack_last(create_context_text(results), kwargs['history'], state['truncation_length'], state) - + return chat.generate_chat_prompt(user_input, state, **kwargs) diff --git a/extensions/superboogav2/chromadb.py b/extensions/superboogav2/chromadb.py index 0da2d8f90c..3381fb1436 100644 --- a/extensions/superboogav2/chromadb.py +++ b/extensions/superboogav2/chromadb.py @@ -1,42 +1,23 @@ -import threading -import chromadb -import posthog -import torch import math +import random +import threading +import chromadb import numpy as np -import extensions.superboogav2.parameters as parameters - +import posthog from chromadb.config import Settings -from sentence_transformers import SentenceTransformer +from chromadb.utils import embedding_functions +import extensions.superboogav2.parameters as parameters from modules.logging_colors import logger -from modules.text_generation import encode, decode +from modules.text_generation import decode, encode -logger.debug('Intercepting all calls to posthog.') +# Intercept calls to posthog posthog.capture = lambda *args, **kwargs: None -class Collecter(): - def __init__(self): - pass - - def add(self, texts: list[str], texts_with_context: list[str], starting_indices: list[int]): - pass +embedder = embedding_functions.SentenceTransformerEmbeddingFunction("sentence-transformers/all-mpnet-base-v2") - def get(self, search_strings: list[str], n_results: int) -> list[str]: - pass - - def clear(self): - pass - - -class Embedder(): - def __init__(self): - pass - - def embed(self, text: str) -> list[torch.Tensor]: - pass class Info: def __init__(self, start_index, text_with_context, distance, id): @@ -58,7 +39,7 @@ def calculate_distance(self, other_info): elif parameters.get_new_dist_strategy() == parameters.DIST_ARITHMETIC_STRATEGY: # Arithmetic mean return (self.distance + other_info.distance) / 2 - else: # Min is default + else: # Min is default return min(self.distance, other_info.distance) def merge_with(self, other_info): @@ -66,7 +47,7 @@ def merge_with(self, other_info): s2 = other_info.text_with_context s1_start = self.start_index s2_start = other_info.start_index - + new_dist = self.calculate_distance(other_info) if self.should_merge(s1, s2, s1_start, s2_start): @@ -84,55 +65,58 @@ def merge_with(self, other_info): return Info(s2_start, s2 + s1[overlap:], new_dist, other_info.id) return None - + @staticmethod def should_merge(s1, s2, s1_start, s2_start): # Check if s1 and s2 are adjacent or overlapping s1_end = s1_start + len(s1) s2_end = s2_start + len(s2) - + return not (s1_end < s2_start or s2_end < s1_start) -class ChromaCollector(Collecter): - def __init__(self, embedder: Embedder): - super().__init__() + +class ChromaCollector(): + def __init__(self): + name 
= ''.join(random.choice('ab') for _ in range(10)) + + self.name = name self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False)) - self.embedder = embedder - self.collection = self.chroma_client.create_collection(name="context", embedding_function=self.embedder.embed) + self.collection = self.chroma_client.create_collection(name=name, embedding_function=embedder) + self.ids = [] self.id_to_info = {} self.embeddings_cache = {} - self.lock = threading.Lock() # Locking so the server doesn't break. + self.lock = threading.Lock() # Locking so the server doesn't break. def add(self, texts: list[str], texts_with_context: list[str], starting_indices: list[int], metadatas: list[dict] = None): with self.lock: assert metadatas is None or len(metadatas) == len(texts), "metadatas must be None or have the same length as texts" - - if len(texts) == 0: + + if len(texts) == 0: return new_ids = self._get_new_ids(len(texts)) (existing_texts, existing_embeddings, existing_ids, existing_metas), \ - (non_existing_texts, non_existing_ids, non_existing_metas) = self._split_texts_by_cache_hit(texts, new_ids, metadatas) + (non_existing_texts, non_existing_ids, non_existing_metas) = self._split_texts_by_cache_hit(texts, new_ids, metadatas) # If there are any already existing texts, add them all at once. if existing_texts: logger.info(f'Adding {len(existing_embeddings)} cached embeddings.') args = {'embeddings': existing_embeddings, 'documents': existing_texts, 'ids': existing_ids} - if metadatas is not None: + if metadatas is not None: args['metadatas'] = existing_metas self.collection.add(**args) # If there are any non-existing texts, compute their embeddings all at once. Each call to embed has significant overhead. if non_existing_texts: - non_existing_embeddings = self.embedder.embed(non_existing_texts).tolist() + non_existing_embeddings = embedder(non_existing_texts) for text, embedding in zip(non_existing_texts, non_existing_embeddings): self.embeddings_cache[text] = embedding logger.info(f'Adding {len(non_existing_embeddings)} new embeddings.') args = {'embeddings': non_existing_embeddings, 'documents': non_existing_texts, 'ids': non_existing_ids} - if metadatas is not None: + if metadatas is not None: args['metadatas'] = non_existing_metas self.collection.add(**args) @@ -145,7 +129,6 @@ def add(self, texts: list[str], texts_with_context: list[str], starting_indices: self.id_to_info.update(new_info) self.ids.extend(new_ids) - def _split_texts_by_cache_hit(self, texts: list[str], new_ids: list[str], metadatas: list[dict]): existing_texts, non_existing_texts = [], [] existing_embeddings = [] @@ -169,7 +152,6 @@ def _split_texts_by_cache_hit(self, texts: list[str], new_ids: list[str], metada return (existing_texts, existing_embeddings, existing_ids, existing_metas), \ (non_existing_texts, non_existing_ids, non_existing_metas) - def _get_new_ids(self, num_new_ids: int): if self.ids: max_existing_id = max(int(id_) for id_ in self.ids) @@ -178,7 +160,6 @@ def _get_new_ids(self, num_new_ids: int): return [str(i + max_existing_id + 1) for i in range(num_new_ids)] - def _find_min_max_start_index(self): max_index, min_index = 0, float('inf') for _, val in self.id_to_info.items(): @@ -188,34 +169,34 @@ def _find_min_max_start_index(self): min_index = val['start_index'] return min_index, max_index - - # NB: Does not make sense to weigh excerpts from different documents. + # NB: Does not make sense to weigh excerpts from different documents. # But let's say that's the user's problem. 
Perfect world scenario: # Apply time weighing to different documents. For each document, then, add # separate time weighing. + def _apply_sigmoid_time_weighing(self, infos: list[Info], document_len: int, time_steepness: float, time_power: float): - sigmoid = lambda x: 1 / (1 + np.exp(-x)) - + def sigmoid(x): + return 1 / (1 + np.exp(-x)) + weights = sigmoid(time_steepness * np.linspace(-10, 10, document_len)) # Scale to [0,time_power] and shift it up to [1-time_power, 1] - weights = weights - min(weights) + weights = weights - min(weights) weights = weights * (time_power / max(weights)) - weights = weights + (1 - time_power) + weights = weights + (1 - time_power) # Reverse the weights - weights = weights[::-1] + weights = weights[::-1] for info in infos: index = info.start_index info.distance *= weights[index] - def _filter_outliers_by_median_distance(self, infos: list[Info], significant_level: float): # Ensure there are infos to filter if not infos: return [] - + # Find info with minimum distance min_info = min(infos, key=lambda x: x.distance) @@ -231,7 +212,6 @@ def _filter_outliers_by_median_distance(self, infos: list[Info], significant_lev return filtered_infos - def _merge_infos(self, infos: list[Info]): merged_infos = [] current_info = infos[0] @@ -247,8 +227,8 @@ def _merge_infos(self, infos: list[Info]): merged_infos.append(current_info) return merged_infos - # Main function for retrieving chunks by distance. It performs merging, time weighing, and mean filtering. + def _get_documents_ids_distances(self, search_strings: list[str], n_results: int): n_results = min(len(self.ids), n_results) if n_results == 0: @@ -262,11 +242,11 @@ def _get_documents_ids_distances(self, search_strings: list[str], n_results: int for search_string in search_strings: result = self.collection.query(query_texts=search_string, n_results=math.ceil(n_results / len(search_strings)), include=['distances']) - curr_infos = [Info(start_index=self.id_to_info[id]['start_index'], - text_with_context=self.id_to_info[id]['text_with_context'], - distance=distance, id=id) + curr_infos = [Info(start_index=self.id_to_info[id]['start_index'], + text_with_context=self.id_to_info[id]['text_with_context'], + distance=distance, id=id) for id, distance in zip(result['ids'][0], result['distances'][0])] - + self._apply_sigmoid_time_weighing(infos=curr_infos, document_len=max_start_index - min_start_index + 1, time_steepness=parameters.get_time_steepness(), time_power=parameters.get_time_power()) curr_infos = self._filter_outliers_by_median_distance(curr_infos, parameters.get_significant_level()) infos.extend(curr_infos) @@ -279,23 +259,23 @@ def _get_documents_ids_distances(self, search_strings: list[str], n_results: int distances = [inf.distance for inf in infos] return texts_with_context, ids, distances - # Get chunks by similarity + def get(self, search_strings: list[str], n_results: int) -> list[str]: with self.lock: documents, _, _ = self._get_documents_ids_distances(search_strings, n_results) return documents - # Get ids by similarity + def get_ids(self, search_strings: list[str], n_results: int) -> list[str]: with self.lock: _, ids, _ = self._get_documents_ids_distances(search_strings, n_results) return ids - - + # Cutoff token count + def _get_documents_up_to_token_count(self, documents: list[str], max_token_count: int): # TODO: Move to caller; We add delimiters there which might go over the limit. 
current_token_count = 0 @@ -308,7 +288,7 @@ def _get_documents_up_to_token_count(self, documents: list[str], max_token_count # If adding this document would exceed the max token count, # truncate the document to fit within the limit. remaining_tokens = max_token_count - current_token_count - + truncated_doc = decode(doc_tokens[:remaining_tokens], skip_special_tokens=True) return_documents.append(truncated_doc) break @@ -317,29 +297,28 @@ def _get_documents_up_to_token_count(self, documents: list[str], max_token_count current_token_count += doc_token_count return return_documents - # Get chunks by similarity and then sort by ids + def get_sorted_by_ids(self, search_strings: list[str], n_results: int, max_token_count: int) -> list[str]: with self.lock: documents, ids, _ = self._get_documents_ids_distances(search_strings, n_results) sorted_docs = [x for _, x in sorted(zip(ids, documents))] return self._get_documents_up_to_token_count(sorted_docs, max_token_count) - - + # Get chunks by similarity and then sort by distance (lowest distance is last). + def get_sorted_by_dist(self, search_strings: list[str], n_results: int, max_token_count: int) -> list[str]: with self.lock: documents, _, distances = self._get_documents_ids_distances(search_strings, n_results) - sorted_docs = [doc for doc, _ in sorted(zip(documents, distances), key=lambda x: x[1])] # sorted lowest -> highest - + sorted_docs = [doc for doc, _ in sorted(zip(documents, distances), key=lambda x: x[1])] # sorted lowest -> highest + # If a document is truncated or competely skipped, it would be with high distance. return_documents = self._get_documents_up_to_token_count(sorted_docs, max_token_count) - return_documents.reverse() # highest -> lowest + return_documents.reverse() # highest -> lowest return return_documents - def delete(self, ids_to_delete: list[str], where: dict): with self.lock: @@ -354,23 +333,16 @@ def delete(self, ids_to_delete: list[str], where: dict): logger.info(f'Successfully deleted {len(ids_to_delete)} records from chromaDB.') - def clear(self): with self.lock: self.chroma_client.reset() - self.collection = self.chroma_client.create_collection("context", embedding_function=self.embedder.embed) + self.ids = [] - self.id_to_info = {} + self.chroma_client.delete_collection(name=self.name) + self.collection = self.chroma_client.create_collection(name=self.name, embedding_function=embedder) logger.info('Successfully cleared all records and reset chromaDB.') -class SentenceTransformerEmbedder(Embedder): - def __init__(self) -> None: - logger.debug('Creating Sentence Embedder...') - self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2") - self.embed = self.model.encode - - def make_collector(): - return ChromaCollector(SentenceTransformerEmbedder()) \ No newline at end of file + return ChromaCollector() diff --git a/extensions/superboogav2/data_preprocessor.py b/extensions/superboogav2/data_preprocessor.py index cbd14b6b0c..1f354cf2e3 100644 --- a/extensions/superboogav2/data_preprocessor.py +++ b/extensions/superboogav2/data_preprocessor.py @@ -11,32 +11,29 @@ * removing specific parts of speech (adverbs and interjections) - TextSummarizer extracts the most important sentences from a long string using text-ranking. 
""" -import pytextrank -import string -import spacy import math -import nltk import re +import string +import nltk +import spacy from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer from num2words import num2words class TextPreprocessorBuilder: - # Define class variables as None initially + # Define class variables as None initially _stop_words = set(stopwords.words('english')) _lemmatizer = WordNetLemmatizer() - + # Some of the functions are expensive. We cache the results. _lemmatizer_cache = {} _pos_remove_cache = {} - def __init__(self, text: str): self.text = text - def to_lower(self): # Match both words and non-word characters tokens = re.findall(r'\b\w+\b|\W+', self.text) @@ -49,7 +46,6 @@ def to_lower(self): self.text = "".join(tokens) return self - def num_to_word(self, min_len: int = 1): # Match both words and non-word characters tokens = re.findall(r'\b\w+\b|\W+', self.text) @@ -58,11 +54,10 @@ def num_to_word(self, min_len: int = 1): if token.isdigit() and len(token) >= min_len: # This is done to pay better attention to numbers (e.g. ticket numbers, thread numbers, post numbers) # 740700 will become "seven hundred and forty thousand seven hundred". - tokens[i] = num2words(int(token)).replace(",","") # Remove commas from num2words. + tokens[i] = num2words(int(token)).replace(",", "") # Remove commas from num2words. self.text = "".join(tokens) return self - def num_to_char_long(self, min_len: int = 1): # Match both words and non-word characters tokens = re.findall(r'\b\w+\b|\W+', self.text) @@ -71,11 +66,13 @@ def num_to_char_long(self, min_len: int = 1): if token.isdigit() and len(token) >= min_len: # This is done to pay better attention to numbers (e.g. ticket numbers, thread numbers, post numbers) # 740700 will become HHHHHHEEEEEAAAAHHHAAA - convert_token = lambda token: ''.join((chr(int(digit) + 65) * (i + 1)) for i, digit in enumerate(token[::-1]))[::-1] + def convert_token(token): + return ''.join((chr(int(digit) + 65) * (i + 1)) for i, digit in enumerate(token[::-1]))[::-1] + tokens[i] = convert_token(tokens[i]) self.text = "".join(tokens) return self - + def num_to_char(self, min_len: int = 1): # Match both words and non-word characters tokens = re.findall(r'\b\w+\b|\W+', self.text) @@ -87,15 +84,15 @@ def num_to_char(self, min_len: int = 1): tokens[i] = ''.join(chr(int(digit) + 65) for digit in token) self.text = "".join(tokens) return self - + def merge_spaces(self): self.text = re.sub(' +', ' ', self.text) return self - + def strip(self): self.text = self.text.strip() return self - + def remove_punctuation(self): self.text = self.text.translate(str.maketrans('', '', string.punctuation)) return self @@ -103,7 +100,7 @@ def remove_punctuation(self): def remove_stopwords(self): self.text = "".join([word for word in re.findall(r'\b\w+\b|\W+', self.text) if word not in TextPreprocessorBuilder._stop_words]) return self - + def remove_specific_pos(self): """ In the English language, adverbs and interjections rarely provide meaningul information. 
@@ -140,7 +137,7 @@ def lemmatize(self): if processed_text: self.text = processed_text return self - + new_text = "".join([TextPreprocessorBuilder._lemmatizer.lemmatize(word) for word in re.findall(r'\b\w+\b|\W+', self.text)]) TextPreprocessorBuilder._lemmatizer_cache[self.text] = new_text self.text = new_text @@ -150,6 +147,7 @@ def lemmatize(self): def build(self): return self.text + class TextSummarizer: _nlp_pipeline = None _cache = {} @@ -165,7 +163,7 @@ def _load_nlp_pipeline(): @staticmethod def process_long_text(text: str, min_num_sent: int) -> list[str]: """ - This function applies a text summarization process on a given text string, extracting + This function applies a text summarization process on a given text string, extracting the most important sentences based on the principle that 20% of the content is responsible for 80% of the meaning (the Pareto Principle). @@ -193,7 +191,7 @@ def process_long_text(text: str, min_num_sent: int) -> list[str]: else: result = [text] - + # Store the result in cache before returning it TextSummarizer._cache[cache_key] = result - return result \ No newline at end of file + return result diff --git a/extensions/superboogav2/data_processor.py b/extensions/superboogav2/data_processor.py index f019f427fe..0a96d4a43b 100644 --- a/extensions/superboogav2/data_processor.py +++ b/extensions/superboogav2/data_processor.py @@ -1,16 +1,17 @@ """ -This module is responsible for processing the corpus and feeding it into chromaDB. It will receive a corpus of text. +This module is responsible for processing the corpus and feeding it into chromaDB. It will receive a corpus of text. It will then split it into chunks of specified length. For each of those chunks, it will append surrounding context. It will only include full words. """ -import re import bisect +import re import extensions.superboogav2.parameters as parameters -from .data_preprocessor import TextPreprocessorBuilder, TextSummarizer from .chromadb import ChromaCollector +from .data_preprocessor import TextPreprocessorBuilder, TextSummarizer + def preprocess_text_no_summary(text) -> str: builder = TextPreprocessorBuilder(text) @@ -42,7 +43,7 @@ def preprocess_text_no_summary(text) -> str: builder.num_to_char(parameters.get_min_num_length()) elif parameters.get_num_conversion_strategy() == parameters.NUM_TO_CHAR_LONG_METHOD: builder.num_to_char_long(parameters.get_min_num_length()) - + return builder.build() @@ -53,10 +54,10 @@ def preprocess_text(text) -> list[str]: def _create_chunks_with_context(corpus, chunk_len, context_left, context_right): """ - This function takes a corpus of text and splits it into chunks of a specified length, - then adds a specified amount of context to each chunk. The context is added by first - going backwards from the start of the chunk and then going forwards from the end of the - chunk, ensuring that the context includes only whole words and that the total context length + This function takes a corpus of text and splits it into chunks of a specified length, + then adds a specified amount of context to each chunk. The context is added by first + going backwards from the start of the chunk and then going forwards from the end of the + chunk, ensuring that the context includes only whole words and that the total context length does not exceed the specified limit. This function uses binary search for efficiency. 
Returns: @@ -102,7 +103,7 @@ def _create_chunks_with_context(corpus, chunk_len, context_left, context_right): # Combine all the words in the context range (before, chunk, and after) chunk_with_context = ''.join(words[context_start_index:context_end_index]) chunks_with_context.append(chunk_with_context) - + # Determine the start index of the chunk with context chunk_with_context_start_index = word_start_indices[context_start_index] chunk_with_context_start_indices.append(chunk_with_context_start_index) @@ -125,9 +126,9 @@ def _clear_chunks(data_chunks, data_chunks_with_context, data_chunk_starting_ind seen_chunk_start = seen_chunks.get(chunk) if seen_chunk_start: # If we've already seen this exact chunk, and the context around it it very close to the seen chunk, then skip it. - if abs(seen_chunk_start-index) < parameters.get_delta_start(): + if abs(seen_chunk_start - index) < parameters.get_delta_start(): continue - + distinct_data_chunks.append(chunk) distinct_data_chunks_with_context.append(context) distinct_data_chunk_starting_indices.append(index) @@ -206,4 +207,4 @@ def process_and_add_to_collector(corpus: str, collector: ChromaCollector, clear_ if clear_collector_before_adding: collector.clear() - collector.add(data_chunks, data_chunks_with_context, data_chunk_starting_indices, [metadata]*len(data_chunks) if metadata is not None else None) \ No newline at end of file + collector.add(data_chunks, data_chunks_with_context, data_chunk_starting_indices, [metadata] * len(data_chunks) if metadata is not None else None) diff --git a/extensions/superboogav2/download_urls.py b/extensions/superboogav2/download_urls.py index ad2726b563..5b5a2e17ac 100644 --- a/extensions/superboogav2/download_urls.py +++ b/extensions/superboogav2/download_urls.py @@ -1,7 +1,7 @@ import concurrent.futures -import requests import re +import requests from bs4 import BeautifulSoup import extensions.superboogav2.parameters as parameters @@ -9,6 +9,7 @@ from .data_processor import process_and_add_to_collector from .utils import create_metadata_source + def _download_single(url): response = requests.get(url, timeout=5) if response.status_code == 200: @@ -62,4 +63,4 @@ def feed_url_into_collector(urls, collector): text = '\n'.join([s.strip() for s in strings]) all_text += text - process_and_add_to_collector(all_text, collector, False, create_metadata_source('url-download')) \ No newline at end of file + process_and_add_to_collector(all_text, collector, False, create_metadata_source('url-download')) diff --git a/extensions/superboogav2/notebook_handler.py b/extensions/superboogav2/notebook_handler.py index 7b86434969..d07a2098ab 100644 --- a/extensions/superboogav2/notebook_handler.py +++ b/extensions/superboogav2/notebook_handler.py @@ -4,13 +4,12 @@ import re import extensions.superboogav2.parameters as parameters - -from modules import shared -from modules.logging_colors import logger from extensions.superboogav2.utils import create_context_text +from modules.logging_colors import logger from .data_processor import preprocess_text + def _remove_special_tokens(string): pattern = r'(<\|begin-user-input\|>|<\|end-user-input\|>|<\|injection-point\|>)' return re.sub(pattern, '', string) @@ -37,4 +36,4 @@ def input_modifier_internal(string, collector, is_chat): # Make the injection string = string.replace('<|injection-point|>', create_context_text(results)) - return _remove_special_tokens(string) \ No newline at end of file + return _remove_special_tokens(string) diff --git a/extensions/superboogav2/optimize.py 
b/extensions/superboogav2/optimize.py index acebf21248..ebdd03c6e2 100644 --- a/extensions/superboogav2/optimize.py +++ b/extensions/superboogav2/optimize.py @@ -3,22 +3,24 @@ Each run, the optimizer will set the default values inside the hyperparameters. At the end, it will output the best ones it has found. """ -import re +import hashlib import json -import optuna +import logging +import re + import gradio as gr import numpy as np -import logging -import hashlib -logging.getLogger('optuna').setLevel(logging.WARNING) +import optuna -import extensions.superboogav2.parameters as parameters +logging.getLogger('optuna').setLevel(logging.WARNING) from pathlib import Path +import extensions.superboogav2.parameters as parameters +from modules.logging_colors import logger + from .benchmark import benchmark from .parameters import Parameters -from modules.logging_colors import logger # Format the parameters into markdown format. @@ -28,7 +30,7 @@ def _markdown_hyperparams(): # Escape any markdown syntax param_name = re.sub(r"([_*\[\]()~`>#+-.!])", r"\\\1", param_name) param_value_default = re.sub(r"([_*\[\]()~`>#+-.!])", r"\\\1", str(param_value['default'])) if param_value['default'] else ' ' - + res.append('* {}: **{}**'.format(param_name, param_value_default)) return '\n'.join(res) @@ -49,13 +51,13 @@ def _convert_np_types(params): # Set the default values for the hyperparameters. def _set_hyperparameters(params): for param_name, param_value in params.items(): - if param_name in Parameters.getInstance().hyperparameters: + if param_name in Parameters.getInstance().hyperparameters: Parameters.getInstance().hyperparameters[param_name]['default'] = param_value # Check if the parameter is for optimization. def _is_optimization_param(val): - is_opt = val.get('should_optimize', False) # Either does not exist or is false + is_opt = val.get('should_optimize', False) # Either does not exist or is false return is_opt @@ -67,7 +69,7 @@ def _get_params_hash(params): def optimize(collector, progress=gr.Progress()): # Inform the user that something is happening. - progress(0, desc=f'Setting Up...') + progress(0, desc='Setting Up...') # Track the current step current_step = 0 @@ -132,4 +134,4 @@ def objective_function(trial): with open('best_params.json', 'w') as fp: json.dump(_convert_np_types(best_params), fp, indent=4) - return str_result \ No newline at end of file + return str_result diff --git a/extensions/superboogav2/parameters.py b/extensions/superboogav2/parameters.py index 1cada46a23..8bb2d1a6fb 100644 --- a/extensions/superboogav2/parameters.py +++ b/extensions/superboogav2/parameters.py @@ -1,18 +1,16 @@ """ -This module provides a singleton class `Parameters` that is used to manage all hyperparameters for the embedding application. +This module provides a singleton class `Parameters` that is used to manage all hyperparameters for the embedding application. It expects a JSON file in `extensions/superboogav2/config.json`. -Each element in the JSON must have a `default` value which will be used for the current run. Elements can have `categories`. -These categories define the range in which the optimizer will search. If the element is tagged with `"should_optimize": false`, +Each element in the JSON must have a `default` value which will be used for the current run. Elements can have `categories`. +These categories define the range in which the optimizer will search. If the element is tagged with `"should_optimize": false`, then the optimizer will only ever use the default value. 
""" -from pathlib import Path - import json +from pathlib import Path from modules.logging_colors import logger - NUM_TO_WORD_METHOD = 'Number to Word' NUM_TO_CHAR_METHOD = 'Number to Char' NUM_TO_CHAR_LONG_METHOD = 'Number to Multi-Char' @@ -366,4 +364,4 @@ def set_api_port(value: int): def set_api_on(value: bool): - Parameters.getInstance().hyperparameters['api_on']['default'] = value \ No newline at end of file + Parameters.getInstance().hyperparameters['api_on']['default'] = value diff --git a/extensions/superboogav2/requirements.txt b/extensions/superboogav2/requirements.txt index 748bacf1ab..d9031167de 100644 --- a/extensions/superboogav2/requirements.txt +++ b/extensions/superboogav2/requirements.txt @@ -1,5 +1,5 @@ beautifulsoup4==4.12.2 -chromadb==0.3.18 +chromadb==0.4.24 lxml optuna pandas==2.0.3 @@ -7,4 +7,4 @@ posthog==2.4.2 sentence_transformers==2.2.2 spacy pytextrank -num2words \ No newline at end of file +num2words diff --git a/extensions/superboogav2/script.py b/extensions/superboogav2/script.py index 66f56e29ea..77c5cced78 100644 --- a/extensions/superboogav2/script.py +++ b/extensions/superboogav2/script.py @@ -7,28 +7,29 @@ # Point to where nltk will find the required data. os.environ['NLTK_DATA'] = str(Path("extensions/superboogav2/nltk_data").resolve()) -import textwrap import codecs +import textwrap + import gradio as gr import extensions.superboogav2.parameters as parameters - -from modules.logging_colors import logger from modules import shared +from modules.logging_colors import logger -from .utils import create_metadata_source +from .api import APIManager +from .benchmark import benchmark +from .chat_handler import custom_generate_chat_prompt_internal from .chromadb import make_collector -from .download_urls import feed_url_into_collector from .data_processor import process_and_add_to_collector -from .benchmark import benchmark -from .optimize import optimize +from .download_urls import feed_url_into_collector from .notebook_handler import input_modifier_internal -from .chat_handler import custom_generate_chat_prompt_internal -from .api import APIManager +from .optimize import optimize +from .utils import create_metadata_source collector = None api_manager = None + def setup(): global collector global api_manager @@ -38,6 +39,7 @@ def setup(): if parameters.get_api_on(): api_manager.start_server(parameters.get_api_port()) + def _feed_data_into_collector(corpus): yield '### Processing data...' 
process_and_add_to_collector(corpus, collector, False, create_metadata_source('direct-text')) @@ -87,7 +89,7 @@ def _get_optimizable_settings() -> list: preprocess_pipeline.append('Merge Spaces') if parameters.should_strip(): preprocess_pipeline.append('Strip Edges') - + return [ parameters.get_time_power(), parameters.get_time_steepness(), @@ -104,8 +106,8 @@ def _get_optimizable_settings() -> list: ] -def _apply_settings(optimization_steps, time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion, - preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, postfix, data_separator, prefix, max_token_count, +def _apply_settings(optimization_steps, time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion, + preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, postfix, data_separator, prefix, max_token_count, chunk_count, chunk_sep, context_len, chunk_regex, chunk_len, threads, strong_cleanup): logger.debug('Applying settings.') @@ -240,7 +242,7 @@ def ui(): with gr.Tab("File input"): file_input = gr.File(label='Input file', type='binary') update_file = gr.Button('Load data') - + with gr.Tab("Settings"): with gr.Accordion("Processing settings", open=True): chunk_len = gr.Textbox(value=parameters.get_chunk_len(), label='Chunk length', info='In characters, not tokens. This value is used when you click on "Load data".') @@ -305,19 +307,16 @@ def ui(): optimize_button = gr.Button('Optimize') optimization_steps = gr.Number(value=parameters.get_optimization_steps(), label='Optimization Steps', info='For how many steps to optimize.', interactive=True) - clear_button = gr.Button('❌ Clear Data') - with gr.Column(): last_updated = gr.Markdown() - all_params = [optimization_steps, time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion, - preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, postfix, data_separator, prefix, max_token_count, + all_params = [optimization_steps, time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion, + preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, postfix, data_separator, prefix, max_token_count, chunk_count, chunk_sep, context_len, chunk_regex, chunk_len, threads, strong_cleanup] - optimizable_params = [time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion, - preprocess_pipeline, chunk_count, context_len, chunk_len] - + optimizable_params = [time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion, + preprocess_pipeline, chunk_count, context_len, chunk_len] update_data.click(_feed_data_into_collector, [data_input], last_updated, show_progress=False) update_url.click(_feed_url_into_collector, [url_input], last_updated, show_progress=False) @@ -326,7 +325,6 @@ def ui(): optimize_button.click(_begin_optimization, [], [last_updated] + optimizable_params, show_progress=True) clear_button.click(_clear_data, [], last_updated, show_progress=False) - optimization_steps.input(fn=_apply_settings, inputs=all_params, show_progress=False) time_power.input(fn=_apply_settings, inputs=all_params, show_progress=False) 
time_steepness.input(fn=_apply_settings, inputs=all_params, show_progress=False) @@ -352,4 +350,4 @@ def ui(): chunk_regex.input(fn=_apply_settings, inputs=all_params, show_progress=False) chunk_len.input(fn=_apply_settings, inputs=all_params, show_progress=False) threads.input(fn=_apply_settings, inputs=all_params, show_progress=False) - strong_cleanup.input(fn=_apply_settings, inputs=all_params, show_progress=False) \ No newline at end of file + strong_cleanup.input(fn=_apply_settings, inputs=all_params, show_progress=False) diff --git a/extensions/superboogav2/utils.py b/extensions/superboogav2/utils.py index 89b367eacc..df84650b51 100644 --- a/extensions/superboogav2/utils.py +++ b/extensions/superboogav2/utils.py @@ -4,6 +4,7 @@ import extensions.superboogav2.parameters as parameters + # Create the context using the prefix + data_separator + postfix from parameters. def create_context_text(results): context = parameters.get_prefix() + parameters.get_data_separator().join(results) + parameters.get_postfix() @@ -13,4 +14,4 @@ def create_context_text(results): # Create metadata with the specified source def create_metadata_source(source: str): - return {'source': source} \ No newline at end of file + return {'source': source} From 0e6eb7c27a877c3b18e7125995e5bc47b7f74767 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Mar 2024 17:30:36 -0300 Subject: [PATCH 03/17] Add AQLM support (transformers loader) (#5466) --- requirements.txt | 1 + requirements_amd.txt | 1 + requirements_amd_noavx2.txt | 1 + requirements_apple_intel.txt | 1 + requirements_apple_silicon.txt | 1 + requirements_cpu_only.txt | 1 + requirements_cpu_only_noavx2.txt | 1 + requirements_noavx2.txt | 1 + requirements_nowheels.txt | 1 + 9 files changed, 9 insertions(+) diff --git a/requirements.txt b/requirements.txt index 3028fe7abd..ed4941d426 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ accelerate==0.27.* +aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_amd.txt b/requirements_amd.txt index f43f6ac45e..b0d7dc7569 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -1,4 +1,5 @@ accelerate==0.27.* +aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 37dfcea2fc..f8c53550f6 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -1,4 +1,5 @@ accelerate==0.27.* +aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 88d578ad06..cb2254af1c 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -1,4 +1,5 @@ accelerate==0.27.* +aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 13767480da..2ad171218b 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -1,4 +1,5 @@ accelerate==0.27.* +aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 92b5b969b8..25bf229232 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -1,4 +1,5 @@ accelerate==0.27.* +aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 01964fde73..02bbf2c0a9 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -1,4 +1,5 @@ accelerate==0.27.* +aqlm[gpu,cpu]==1.1.0 colorama 
datasets einops diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index a4f9bcb966..c81a547294 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -1,4 +1,5 @@ accelerate==0.27.* +aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index 7166015dd4..a717931500 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -1,4 +1,5 @@ accelerate==0.27.* +aqlm[gpu,cpu]==1.1.0 colorama datasets einops From d0663bae31cabc9c15aba009c0d65a577f75d126 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Mar 2024 17:36:28 -0300 Subject: [PATCH 04/17] Bump AutoAWQ to 0.2.3 (Linux only) (#5658) --- requirements.txt | 2 +- requirements_amd.txt | 2 ++ requirements_amd_noavx2.txt | 2 ++ requirements_noavx2.txt | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index ed4941d426..5fccb8260d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -71,4 +71,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu121-py3-none-any.whl -autoawq==0.1.8; platform_system == "Linux" or platform_system == "Windows" +autoawq==0.2.3; platform_system == "Linux" diff --git a/requirements_amd.txt b/requirements_amd.txt index b0d7dc7569..01da808525 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -46,3 +46,5 @@ https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.1 https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index f8c53550f6..fe56bf7981 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -44,3 +44,5 @@ https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.1 https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15-py3-none-any.whl; 
platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index c81a547294..3058114a67 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -71,4 +71,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX/ctransformers-0.2.27+cu121-py3-none-any.whl -autoawq==0.1.8; platform_system == "Linux" or platform_system == "Windows" +autoawq==0.2.3; platform_system == "Linux" From 238f69accc254cbad126246eaf2607317b822a1d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Mar 2024 12:52:52 -0800 Subject: [PATCH 05/17] Move "Command for chat-instruct mode" to the main chat tab (closes #5634) --- modules/ui_chat.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index ad4a4f0fda..7255bb9987 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -88,6 +88,9 @@ def create_ui(): with gr.Row(): shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') + with gr.Row(): + shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=16, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=False, elem_classes=['add_scrollbar']) + def create_chat_settings_ui(): mu = shared.args.multi_user @@ -133,7 +136,6 @@ def create_chat_settings_ui(): with gr.Column(): shared.gradio['chat_template_str'] = gr.Textbox(value=shared.settings['chat_template_str'], label='Chat template', lines=22, elem_classes=['add_scrollbar', 'monospace']) - shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=4, label='Command for chat-instruct mode', info='<|character|> gets replaced by the bot name, and <|prompt|> gets replaced by the regular chat prompt.', 
elem_classes=['add_scrollbar']) with gr.Tab('Chat history'): with gr.Row(): @@ -293,7 +295,7 @@ def create_event_handlers(): lambda: None, None, None, _js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}') shared.gradio['mode'].change( - lambda x: gr.update(visible=x != 'instruct'), gradio('mode'), gradio('chat_style'), show_progress=False).then( + lambda x: [gr.update(visible=x != 'instruct'), gr.update(visible=x == 'chat-instruct')], gradio('mode'), gradio('chat_style', 'chat-instruct_command'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.load_latest_history, gradio('interface_state'), gradio('history')).then( chat.redraw_html, gradio(reload_arr), gradio('display')).then( From 549bb88975e119666dcf394b7c7320650aa299b7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Mar 2024 12:54:30 -0800 Subject: [PATCH 06/17] Increase height of "Custom stopping strings" UI field --- modules/ui_parameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 7aebe67224..f809b5350f 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -76,7 +76,7 @@ def create_ui(default_preset): shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.') shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.') - shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"') + shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=2, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"') shared.gradio['custom_token_bans'] = gr.Textbox(value=shared.settings['custom_token_bans'] or None, label='Custom token bans', info='Specific token IDs to ban from generating, comma-separated. 
The IDs can be found in the Default or Notebook tab.') with gr.Column(): From 9271e80914b62fa225e9e5c8c37c961eccaba2be Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Mar 2024 14:54:56 -0800 Subject: [PATCH 07/17] Add back AutoAWQ for Windows https://github.com/casper-hansen/AutoAWQ/issues/377#issuecomment-1986440695 --- requirements.txt | 2 +- requirements_noavx2.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5fccb8260d..21e6a72929 100644 --- a/requirements.txt +++ b/requirements.txt @@ -71,4 +71,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu121-py3-none-any.whl -autoawq==0.2.3; platform_system == "Linux" +autoawq==0.2.3; platform_system == "Linux" or platform_system == "Windows" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 3058114a67..f7ee0e233e 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -71,4 +71,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX/ctransformers-0.2.27+cu121-py3-none-any.whl -autoawq==0.2.3; platform_system == "Linux" +autoawq==0.2.3; platform_system == "Linux" or platform_system == "Windows" From afb51bd5d64d6698e42b5b520562ccae269caf77 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Mar 2024 00:25:33 -0300 Subject: [PATCH 08/17] Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) --- modules/cache_utils.py | 108 +++++++++++++++++++++++++++++ modules/llama_cpp_python_hijack.py | 22 ++++++ modules/loaders.py | 4 ++ modules/shared.py | 2 + modules/text_generation.py | 7 ++ modules/ui.py | 2 + modules/ui_model_menu.py | 2 + 7 files changed, 147 insertions(+) create mode 100644 modules/cache_utils.py diff --git a/modules/cache_utils.py b/modules/cache_utils.py new file mode 100644 index 0000000000..3a200d8e69 --- /dev/null +++ b/modules/cache_utils.py @@ -0,0 +1,108 @@ +import torch + +from modules import shared +from modules.logging_colors import logger + + +def process_llamacpp_cache(model, new_sequence, past_sequence): + i1, i2, j1, j2 = find_longest_common_substring_indices(past_sequence, new_sequence) + overlap_length = i2 - i1 + 1 + + # Do StreamingLLM if i1 > 0 (ie the longest common subsequence is not a prefix) + # and the overlap length is sufficiently long. 
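+    # Here (i1, i2) and (j1, j2) are the inclusive bounds of that longest common
+    # substring within past_sequence and new_sequence respectively, and the 0.2
+    # factor below requires the shared block to cover more than 20% of the new sequence.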
+ if i1 > 0 and overlap_length > 0.2 * len(new_sequence): + + new_sequence = torch.tensor(new_sequence) + past_sequence = torch.tensor(past_sequence) + + prefix_length = find_prefix_length(past_sequence[:i1], new_sequence[:j1]) + sink_length = prefix_length + if sink_length < shared.args.attention_sink_size: + sink_length = shared.args.attention_sink_size + + removed_length = i1 - sink_length + + matching_prefix = past_sequence[:prefix_length] + removed_chunk = past_sequence[sink_length:i1] + overlapping_sequence = new_sequence[j1:j2 + 1] + added_chunk = new_sequence[j2 + 1:] + + # print(past_sequence) + # print(new_sequence) + + print() + print('MATCHING PREFIX=', repr(shared.tokenizer.decode(matching_prefix))) + print('ADDED CHUNK=', repr(shared.tokenizer.decode(added_chunk))) + print('REMOVED CHUNK=', repr(shared.tokenizer.decode(removed_chunk))) + print() + + # Remove interval [sink_length, sink_length + removed_length) from the context + # Subtract removed_length from model.n_tokens + model._ctx.kv_cache_seq_rm(0, sink_length, sink_length + removed_length) + model._ctx.kv_cache_seq_shift(0, sink_length + removed_length, -1, -removed_length) + + new_sequence = new_sequence.tolist() + model.input_ids[:j2 + 1] = new_sequence[:j2 + 1] + model.n_tokens = j2 + 1 + + return new_sequence[:j2 + 1] + else: + return past_sequence + + +def find_prefix_length(past_seq, seq_tensor): + ''' + Given two torch tensors, finds the length of the longest + common prefix between the two. + ''' + min_length = min(past_seq.shape[0], seq_tensor.shape[0]) + indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length])) + if len(indices) > 0: + prefix_length = indices[0].item() + else: + prefix_length = min_length + + return prefix_length + + +def find_longest_common_substring_indices(list1, list2): + ''' + Given two lists, solves the Longest Common Substring problem. + + It returns the indices where the substring starts and ends in + s1 and s2. 
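+    All four returned indices are inclusive.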
+ + Example: + + ir, jr, ir2, jr2 = find_longest_common_substring_indices(s1, s2) + print(s1[ir:jr + 1]) + print(s2[ir2:jr2 + 1]) + + Adapted from + https://rosettacode.org/wiki/Longest_common_substring#Python + ''' + + len_list1, len_list2 = len(list1), len(list2) + start_index_list1, end_index_list1 = 0, -1 + start_index_list2, end_index_list2 = 0, -1 + + for index1 in range(len_list1): + try: + index2 = list2.index(list1[index1]) + except ValueError: + continue + while index2 >= 0: + temp_index1, temp_index2 = index1, index2 + while temp_index1 < len_list1 and temp_index2 < len_list2 and list2[temp_index2] == list1[temp_index1]: + if temp_index1 - index1 >= end_index_list1 - start_index_list1: + start_index_list1, end_index_list1 = index1, temp_index1 + start_index_list2, end_index_list2 = index2, temp_index2 + + temp_index1 += 1 + temp_index2 += 1 + try: + index2 = list2.index(list1[index1], index2 + 1) + except ValueError: + break + + return start_index_list1, end_index_list1, start_index_list2, end_index_list2 diff --git a/modules/llama_cpp_python_hijack.py b/modules/llama_cpp_python_hijack.py index 9bb38512e5..96de839e01 100644 --- a/modules/llama_cpp_python_hijack.py +++ b/modules/llama_cpp_python_hijack.py @@ -2,6 +2,9 @@ from tqdm import tqdm +from modules import shared +from modules.cache_utils import process_llamacpp_cache + try: import llama_cpp except: @@ -58,6 +61,25 @@ def eval_with_progress(self, tokens: Sequence[int]): self.n_tokens += n_tokens +def monkey_patch_generate(lib): + + def my_generate(self, *args, **kwargs): + + if shared.args.streaming_llm: + new_sequence = args[0] + past_sequence = self._input_ids + + # Do the cache trimming for StreamingLLM + process_llamacpp_cache(self, new_sequence, past_sequence) + + for output in self.original_generate(*args, **kwargs): + yield output + + lib.Llama.original_generate = lib.Llama.generate + lib.Llama.generate = my_generate + + for lib in [llama_cpp, llama_cpp_cuda, llama_cpp_cuda_tensorcores]: if lib is not None: lib.Llama.eval = eval_with_progress + monkey_patch_generate(lib) diff --git a/modules/loaders.py b/modules/loaders.py index 330f290389..f1c44a903c 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -46,6 +46,8 @@ 'no_offload_kqv', 'row_split', 'tensorcores', + 'streaming_llm', + 'attention_sink_size', ], 'llamacpp_HF': [ 'n_ctx', @@ -69,6 +71,8 @@ 'no_offload_kqv', 'row_split', 'tensorcores', + 'streaming_llm', + 'attention_sink_size', 'llamacpp_HF_info', ], 'ExLlamav2_HF': [ diff --git a/modules/shared.py b/modules/shared.py index 10a70001cc..8758cee1ca 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -130,6 +130,8 @@ group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.') group.add_argument('--row_split', action='store_true', help='Split the model by rows across GPUs. 
This may improve multi-gpu performance.') +group.add_argument('--streaming-llm', action='store_true', help='Activates StreamingLLM, which prevents the prompt from ever being reevaluated when old chat messages are removed due to the context length for the model being reached.') +group.add_argument('--attention-sink-size', type=int, default=5, help='Minimum attention sink length from StreamingLLM.') # ExLlamaV2 group = parser.add_argument_group('ExLlamaV2') diff --git a/modules/text_generation.py b/modules/text_generation.py index 227d1822d1..dc9c63eaf1 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -13,6 +13,7 @@ from transformers import LogitsProcessorList, is_torch_xpu_available import modules.shared as shared +from modules.cache_utils import process_llamacpp_cache from modules.callbacks import ( Iteratorize, Stream, @@ -364,6 +365,12 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings print(decode(input_ids[0], skip_special_tokens=False)) print() + # Handle StreamingLLM for llamacpp_HF + if shared.model.__class__.__name__ == 'LlamacppHF' and shared.args.streaming_llm: + tmp = process_llamacpp_cache(shared.model.model, input_ids[-1].tolist(), shared.model.model._input_ids) + shared.model.past_seq = torch.tensor(tmp) + shared.model.save_cache() + t0 = time.time() try: if not is_chat and not shared.is_seq2seq: diff --git a/modules/ui.py b/modules/ui.py index 6e1b12b0bf..4a03f8432c 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -97,6 +97,8 @@ def list_model_elements(): 'no_offload_kqv', 'row_split', 'tensorcores', + 'streaming_llm', + 'attention_sink_size', 'hqq_backend', ] if is_torch_xpu_available(): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index c29db7d0d9..e3b0e883f3 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -117,6 +117,8 @@ def create_ui(): shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards.') + shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') + shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size) shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. 
This saves VRAM but reduces the performance.') From cf0697936a1f1434f6064747f80a6acb3d861fb9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 8 Mar 2024 21:39:02 -0800 Subject: [PATCH 09/17] Optimize StreamingLLM by over 10x --- modules/cache_utils.py | 14 ++++++++++---- modules/text_generation.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/modules/cache_utils.py b/modules/cache_utils.py index 3a200d8e69..3f5a0f31ae 100644 --- a/modules/cache_utils.py +++ b/modules/cache_utils.py @@ -1,10 +1,13 @@ import torch +from numba import njit from modules import shared -from modules.logging_colors import logger def process_llamacpp_cache(model, new_sequence, past_sequence): + if len(past_sequence) == 0 or len(new_sequence) == 0: + return past_sequence + i1, i2, j1, j2 = find_longest_common_substring_indices(past_sequence, new_sequence) overlap_length = i2 - i1 + 1 @@ -65,6 +68,7 @@ def find_prefix_length(past_seq, seq_tensor): return prefix_length +@njit def find_longest_common_substring_indices(list1, list2): ''' Given two lists, solves the Longest Common Substring problem. @@ -86,11 +90,13 @@ def find_longest_common_substring_indices(list1, list2): start_index_list1, end_index_list1 = 0, -1 start_index_list2, end_index_list2 = 0, -1 - for index1 in range(len_list1): + # for index1 in tqdm(range(0, len_list1), desc="StreamingLLM prompt comparison", leave=False): + for index1 in range(0, len_list1): try: index2 = list2.index(list1[index1]) - except ValueError: + except: continue + while index2 >= 0: temp_index1, temp_index2 = index1, index2 while temp_index1 < len_list1 and temp_index2 < len_list2 and list2[temp_index2] == list1[temp_index1]: @@ -102,7 +108,7 @@ def find_longest_common_substring_indices(list1, list2): temp_index2 += 1 try: index2 = list2.index(list1[index1], index2 + 1) - except ValueError: + except: break return start_index_list1, end_index_list1, start_index_list2, end_index_list2 diff --git a/modules/text_generation.py b/modules/text_generation.py index dc9c63eaf1..d1a59a9d98 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -367,7 +367,7 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings # Handle StreamingLLM for llamacpp_HF if shared.model.__class__.__name__ == 'LlamacppHF' and shared.args.streaming_llm: - tmp = process_llamacpp_cache(shared.model.model, input_ids[-1].tolist(), shared.model.model._input_ids) + tmp = process_llamacpp_cache(shared.model.model, input_ids[-1].tolist(), shared.model.model._input_ids.tolist()) shared.model.past_seq = torch.tensor(tmp) shared.model.save_cache() From 52a34921efc5953fe3a583b69b1dc26ca43100f9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 9 Mar 2024 16:33:12 -0800 Subject: [PATCH 10/17] Installer: validate the checksum for the miniconda installer on Windows --- start_windows.bat | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/start_windows.bat b/start_windows.bat index 900ae7a407..ebcc199706 100755 --- a/start_windows.bat +++ b/start_windows.bat @@ -1,4 +1,5 @@ @echo off +setlocal enabledelayedexpansion cd /D "%~dp0" @@ -25,6 +26,7 @@ set INSTALL_DIR=%cd%\installer_files set CONDA_ROOT_PREFIX=%cd%\installer_files\conda set INSTALL_ENV_DIR=%cd%\installer_files\env set MINICONDA_DOWNLOAD_URL=https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Windows-x86_64.exe +set 
MINICONDA_CHECKSUM=307194e1f12bbeb52b083634e89cc67db4f7980bd542254b43d3309eaf7cb358 set conda_exists=F @rem figure out whether git and conda needs to be installed @@ -39,6 +41,18 @@ if "%conda_exists%" == "F" ( mkdir "%INSTALL_DIR%" call curl -Lk "%MINICONDA_DOWNLOAD_URL%" > "%INSTALL_DIR%\miniconda_installer.exe" || ( echo. && echo Miniconda failed to download. && goto end ) + for /f %%a in ('CertUtil -hashfile "%INSTALL_DIR%\miniconda_installer.exe" SHA256 ^| find /i /v " " ^| find /i "%MINICONDA_CHECKSUM%"') do ( + set "output=%%a" + ) + + if not defined output ( + echo The checksum verification for miniconda_installer.exe has failed. + del "%INSTALL_DIR%\miniconda_installer.exe" + goto end + ) else ( + echo The checksum verification for miniconda_installer.exe has passed successfully. + ) + echo Installing Miniconda to %CONDA_ROOT_PREFIX% start /wait "" "%INSTALL_DIR%\miniconda_installer.exe" /InstallationType=JustMe /NoShortcuts=1 /AddToPath=0 /RegisterPython=0 /NoRegistry=1 /S /D=%CONDA_ROOT_PREFIX% @@ -46,8 +60,8 @@ if "%conda_exists%" == "F" ( echo Miniconda version: call "%CONDA_ROOT_PREFIX%\_conda.exe" --version || ( echo. && echo Miniconda not found. && goto end ) - @rem delete the Miniconda installer - del "%INSTALL_DIR%\miniconda_installer.exe" + @rem delete the Miniconda installer + del "%INSTALL_DIR%\miniconda_installer.exe" ) @rem create the installer env From 763f9beb7e0233a1caf700fbc4d7a0020ba08162 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Mar 2024 08:30:53 -0700 Subject: [PATCH 11/17] Bump bitsandbytes to 0.43, add official Windows wheel --- requirements.txt | 5 +---- requirements_noavx2.txt | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 21e6a72929..9c4c9a6578 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ accelerate==0.27.* aqlm[gpu,cpu]==1.1.0 +bitsandbytes==0.43.* colorama datasets einops @@ -30,10 +31,6 @@ flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken -# bitsandbytes -bitsandbytes==0.42.*; platform_system != "Windows" -https://github.com/oobabooga/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.42.0-py3-none-win_amd64.whl; platform_system == "Windows" - # llama-cpp-python (CPU only, AVX2) https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index f7ee0e233e..c7949ee2a1 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -1,5 +1,6 @@ accelerate==0.27.* aqlm[gpu,cpu]==1.1.0 +bitsandbytes==0.43.* colorama datasets einops @@ -30,10 +31,6 @@ flask_cloudflared==0.0.14 sse-starlette==1.6.5 tiktoken -# bitsandbytes -bitsandbytes==0.42.*; platform_system != "Windows" -https://github.com/oobabooga/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.42.0-py3-none-win_amd64.whl; platform_system == "Windows" - # llama-cpp-python (CPU only, no AVX2) 
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" From 67b24b0b88de3dc2b99be267b3f9a640af49e702 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Mar 2024 09:07:27 -0700 Subject: [PATCH 12/17] Bump llama-cpp-python to 0.2.56 --- requirements.txt | 24 ++++++++++++------------ requirements_amd.txt | 12 ++++++------ requirements_amd_noavx2.txt | 8 ++++---- requirements_apple_intel.txt | 12 ++++++------ requirements_apple_silicon.txt | 16 ++++++++-------- requirements_cpu_only.txt | 8 ++++---- requirements_cpu_only_noavx2.txt | 8 ++++---- requirements_noavx2.txt | 24 ++++++++++++------------ 8 files changed, 56 insertions(+), 56 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9c4c9a6578..f66c016767 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,22 +32,22 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.55+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.55+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.55+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.55+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.56+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.56+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.56+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.56+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.55+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.55+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.55+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.55+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.56+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.56+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.56+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.56+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements_amd.txt b/requirements_amd.txt index 01da808525..b4cf3f46f2 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -31,14 +31,14 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.55+rocm5.6.1-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.55+rocm5.6.1-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.56+rocm5.6.1-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.56+rocm5.6.1-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
and python_version == "3.10" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index fe56bf7981..d2936848ad 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -31,10 +31,10 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index cb2254af1c..bf48e02340 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -31,10 +31,10 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.0.14/exllamav2-0.0.14-py3-none-any.whl diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 2ad171218b..3298dfbdcc 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -31,12 +31,12 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp311-cp311-macosx_11_0_arm64.whl; platform_system == 
"Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.55-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp310-cp310-macosx_13_0_arm64.whl; 
platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.56-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" https://github.com/oobabooga/exllamav2/releases/download/v0.0.14/exllamav2-0.0.14-py3-none-any.whl diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 25bf229232..0f4a734f8f 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -31,7 +31,7 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 02bbf2c0a9..df470529c8 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -31,7 +31,7 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index c7949ee2a1..5eeaca1d53 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -32,22 +32,22 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.55+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.56+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.55+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.55+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.55+cu121avx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.55+cu121avx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.56+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.56+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.56+cu121avx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.56+cu121avx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.55+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.55+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.55+cu121avx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.55+cu121avx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.56+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.56+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.56+cu121avx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.56+cu121avx-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" From b3ade5832b6f79f1521e376aec9a31be0ea4f45f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Mar 2024 09:41:17 -0700 Subject: [PATCH 13/17] Keep AQLM only for Linux (fails to install on Windows) --- requirements.txt | 2 +- requirements_amd.txt | 1 - requirements_amd_noavx2.txt | 1 - requirements_apple_intel.txt | 1 - requirements_apple_silicon.txt | 1 - requirements_cpu_only.txt | 1 - requirements_cpu_only_noavx2.txt | 1 - requirements_noavx2.txt | 2 +- requirements_nowheels.txt | 1 - 9 files changed, 2 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index f66c016767..d608907a05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ accelerate==0.27.* -aqlm[gpu,cpu]==1.1.0 +aqlm[gpu,cpu]==1.1.0; platform_system == "Linux" bitsandbytes==0.43.* colorama datasets diff --git a/requirements_amd.txt b/requirements_amd.txt index b4cf3f46f2..57bd8d64bd 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -1,5 +1,4 @@ accelerate==0.27.* -aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index d2936848ad..04f6a1a470 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -1,5 +1,4 @@ accelerate==0.27.* -aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index bf48e02340..7c451c8033 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -1,5 +1,4 @@ accelerate==0.27.* -aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 3298dfbdcc..1178b8b443 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -1,5 +1,4 @@ accelerate==0.27.* -aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 0f4a734f8f..beda6bd83c 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -1,5 +1,4 @@ accelerate==0.27.* -aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index df470529c8..3300564e70 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -1,5 +1,4 @@ accelerate==0.27.* -aqlm[gpu,cpu]==1.1.0 colorama datasets einops diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 5eeaca1d53..0c92c8d2ff 100644 --- a/requirements_noavx2.txt +++ 
b/requirements_noavx2.txt @@ -1,5 +1,5 @@ accelerate==0.27.* -aqlm[gpu,cpu]==1.1.0 +aqlm[gpu,cpu]==1.1.0; platform_system == "Linux" bitsandbytes==0.43.* colorama datasets diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index a717931500..7166015dd4 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -1,5 +1,4 @@ accelerate==0.27.* -aqlm[gpu,cpu]==1.1.0 colorama datasets einops From a102c704f5c65c62dfe27038d2946061be2c6ba5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Mar 2024 16:13:29 -0700 Subject: [PATCH 14/17] Add numba to requirements.txt --- requirements.txt | 1 + requirements_amd.txt | 1 + requirements_amd_noavx2.txt | 1 + requirements_apple_intel.txt | 1 + requirements_apple_silicon.txt | 1 + requirements_cpu_only.txt | 1 + requirements_cpu_only_noavx2.txt | 1 + requirements_noavx2.txt | 1 + requirements_nowheels.txt | 1 + 9 files changed, 9 insertions(+) diff --git a/requirements.txt b/requirements.txt index d608907a05..8ab4e61b20 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ hqq==0.1.5 jinja2==3.1.2 lm_eval==0.3.0 markdown +numba==0.59.* numpy==1.26.* optimum==1.17.* pandas diff --git a/requirements_amd.txt b/requirements_amd.txt index 57bd8d64bd..f3045d56f5 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -7,6 +7,7 @@ hqq==0.1.5 jinja2==3.1.2 lm_eval==0.3.0 markdown +numba==0.59.* numpy==1.26.* optimum==1.17.* pandas diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 04f6a1a470..7004e38809 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -7,6 +7,7 @@ hqq==0.1.5 jinja2==3.1.2 lm_eval==0.3.0 markdown +numba==0.59.* numpy==1.26.* optimum==1.17.* pandas diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 7c451c8033..2a398f9174 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -7,6 +7,7 @@ hqq==0.1.5 jinja2==3.1.2 lm_eval==0.3.0 markdown +numba==0.59.* numpy==1.26.* optimum==1.17.* pandas diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 1178b8b443..fc10bee812 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -7,6 +7,7 @@ hqq==0.1.5 jinja2==3.1.2 lm_eval==0.3.0 markdown +numba==0.59.* numpy==1.26.* optimum==1.17.* pandas diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index beda6bd83c..05cad5cf18 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -7,6 +7,7 @@ hqq==0.1.5 jinja2==3.1.2 lm_eval==0.3.0 markdown +numba==0.59.* numpy==1.26.* optimum==1.17.* pandas diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index 3300564e70..4eeef9e9a5 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -7,6 +7,7 @@ hqq==0.1.5 jinja2==3.1.2 lm_eval==0.3.0 markdown +numba==0.59.* numpy==1.26.* optimum==1.17.* pandas diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 0c92c8d2ff..295680fca9 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -9,6 +9,7 @@ hqq==0.1.5 jinja2==3.1.2 lm_eval==0.3.0 markdown +numba==0.59.* numpy==1.26.* optimum==1.17.* pandas diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index 7166015dd4..abb9d45d73 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -7,6 +7,7 @@ hqq==0.1.5 jinja2==3.1.2 lm_eval==0.3.0 markdown +numba==0.59.* numpy==1.26.* optimum==1.17.* pandas From 
abcdd0ad5bfacdb403e0a0fff4d101a421cb2446 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Mar 2024 16:15:52 -0700 Subject: [PATCH 15/17] API: don't use settings.yaml for default values --- extensions/openai/completions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 897d542ce5..3bc5170a09 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -250,13 +250,13 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False) - else: instruction_template_str = shared.settings['instruction_template_str'] - chat_template_str = body['chat_template_str'] or shared.settings['chat_template_str'] - chat_instruct_command = body['chat_instruct_command'] or shared.settings['chat-instruct_command'] + chat_template_str = body['chat_template_str'] or shared.default_settings['chat_template_str'] + chat_instruct_command = body['chat_instruct_command'] or shared.default_settings['chat-instruct_command'] # Chat character - character = body['character'] or shared.settings['character'] + character = body['character'] or shared.default_settings['character'] character = "Assistant" if character == "None" else character - name1 = body['user_name'] or shared.settings['name1'] + name1 = body['user_name'] or shared.default_settings['name1'] name1, name2, _, greeting, context = load_character_memoized(character, name1, '') name2 = body['bot_name'] or name2 context = body['context'] or context From 15d90d9bd5ebc7fb55cadf35883c43bd9c57cb3b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Mar 2024 18:20:50 -0700 Subject: [PATCH 16/17] Minor logging change --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index e3b0e883f3..66f62e9123 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -294,7 +294,7 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur downloader.check_model_files(model, branch, links, sha256, output_folder) progress(1.0) else: - yield (f"Downloading file{'s' if len(links) > 1 else ''} to `{output_folder}/`") + yield (f"Downloading file{'s' if len(links) > 1 else ''} to `{output_folder}`") downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=4, is_llamacpp=is_llamacpp) yield (f"Model successfully saved to `{output_folder}/`.") From 056717923f953eab362f0937a83f8d1a612c4aab Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 10 Mar 2024 19:15:23 -0700 Subject: [PATCH 17/17] Document StreamingLLM --- README.md | 3 +++ docs/04 - Model Tab.md | 9 +++++---- modules/shared.py | 4 ++-- modules/ui_model_menu.py | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index bb5d2810f1..6b92448c11 100644 --- a/README.md +++ b/README.md @@ -269,6 +269,9 @@ List of command-line flags | `--logits_all`| Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower. | | `--no_offload_kqv` | Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. | | `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. 
| +| `--row_split` | Split the model by rows across GPUs. This may improve multi-gpu performance. | +| `--streaming-llm` | Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. | +| `--attention-sink-size ATTENTION_SINK_SIZE` | StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn't share a prefix with the old prompt. | #### ExLlamav2 diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md index 3766c96c9d..05b85b481b 100644 --- a/docs/04 - Model Tab.md +++ b/docs/04 - Model Tab.md @@ -80,16 +80,17 @@ Example: https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF * **n-gpu-layers**: The number of layers to allocate to the GPU. If set to 0, only the CPU will be used. If you want to offload all layers, you can simply set this to the maximum value. * **n_ctx**: Context length of the model. In llama.cpp, the cache is preallocated, so the higher this value, the higher the VRAM. It is automatically set to the maximum sequence length for the model based on the metadata inside the GGUF file, but you may need to lower this value be able to fit the model into your GPU. After loading the model, the "Truncate the prompt up to this length" parameter under "Parameters" > "Generation" is automatically set to your chosen "n_ctx" so that you don't have to set the same thing twice. +* **tensor_split**: For multi-gpu only. Sets the amount of memory to allocate per GPU as proportions. Not to be confused with other loaders where this is set in GB; here you can set something like `30,70` for 30%/70%. +* **n_batch**: Batch size for prompt processing. Higher values are supposed to make generation faster, but I have never obtained any benefit from changing this value. * **threads**: Number of threads. Recommended value: your number of physical cores. * **threads_batch**: Number of threads for batch processing. Recommended value: your total number of cores (physical + virtual). -* **n_batch**: Batch size for prompt processing. Higher values are supposed to make generation faster, but I have never obtained any benefit from changing this value. +* **tensorcores**: Use llama.cpp compiled with "tensor cores" support, which improves performance on NVIDIA RTX cards in most cases. +* **streamingllm**: Experimental feature to avoid re-evaluating the entire prompt when part of it is removed, for instance, when you hit the context length for the model in chat mode and an old message is removed. +* **cpu**: Force a version of llama.cpp compiled without GPU acceleration to be used. Can usually be ignored. Only set this if you want to use CPU only and llama.cpp doesn't work otherwise. * **no_mul_mat_q**: Disable the mul_mat_q kernel. This kernel usually improves generation speed significantly. This option to disable it is included in case it doesn't work on some system. * **no-mmap**: Loads the model into memory at once, possibly preventing I/O operations later on at the cost of a longer load time. * **mlock**: Force the system to keep the model in RAM rather than swapping or compressing (no idea what this means, never used it). * **numa**: May improve performance on certain multi-cpu systems. -* **cpu**: Force a version of llama.cpp compiled without GPU acceleration to be used. Can usually be ignored. Only set this if you want to use CPU only and llama.cpp doesn't work otherwise. -* **tensor_split**: For multi-gpu only. Sets the amount of memory to allocate per GPU. -* **Seed**: The seed for the llama.cpp random number generator. 
Not very useful as it can only be set once (that I'm aware). ### llamacpp_HF diff --git a/modules/shared.py b/modules/shared.py index 8758cee1ca..69ad0cfd27 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -130,8 +130,8 @@ group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.') group.add_argument('--row_split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') -group.add_argument('--streaming-llm', action='store_true', help='Activates StreamingLLM, which prevents the prompt from ever being reevaluated when old chat messages are removed due to the context length for the model being reached.') -group.add_argument('--attention-sink-size', type=int, default=5, help='Minimum attention sink length from StreamingLLM.') +group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') +group.add_argument('--attention-sink-size', type=int, default=5, help='StreamingLLM: number of sink tokens. Only used if the trimmed prompt does not share a prefix with the old prompt.') # ExLlamaV2 group = parser.add_argument_group('ExLlamaV2') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 66f62e9123..d268770a52 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -118,7 +118,7 @@ def create_ui(): shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards.') shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') - shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size) + shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.') shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
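The `--attention-sink-size` help text added above only matters when the trimmed prompt no longer shares a prefix with the previous prompt. A minimal sketch of that condition in Python (illustrative only, with made-up token lists and helper names, not the actual llama-cpp-python cache logic):

def shared_prefix_length(old_tokens: list[int], new_tokens: list[int]) -> int:
    """Length of the common token prefix between the previous and current prompt."""
    n = 0
    for old, new in zip(old_tokens, new_tokens):
        if old != new:
            break
        n += 1
    return n


def uses_attention_sinks(old_tokens: list[int], new_tokens: list[int]) -> bool:
    """True when StreamingLLM would have to fall back to its sink tokens."""
    return shared_prefix_length(old_tokens, new_tokens) == 0


# Example: the oldest chat message was trimmed, so the prompts diverge at token 0
# and only the first --attention-sink-size tokens would be kept as sinks.
print(uses_attention_sinks([1, 2, 3, 4], [2, 3, 4, 5]))  # True

In the common chat case only the newest messages change, the prompts share a long prefix, and the sink size never comes into play.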
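On patch 13's requirements change: the `; platform_system == "Linux"` suffix on the aqlm line is a standard PEP 508 environment marker. pip evaluates it against the installing machine and skips the requirement when the marker is false, which is what keeps AQLM off Windows instead of failing to install there. A quick local check of how such a marker evaluates (a sketch assuming the `packaging` library, which pip itself vendors):

from packaging.markers import Marker

# True on Linux, False on Windows/macOS, so pip installs aqlm only on Linux.
print(Marker('platform_system == "Linux"').evaluate())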