diff --git a/README.md b/README.md index fb22d09945..0e16aa30a5 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,9 @@ A Gradio web UI for Large Language Models. Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation. -|![Image1](https://github.com/oobabooga/screenshots/raw/main/print_instruct.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/print_chat.png) | +|![Image1](https://github.com/oobabooga/screenshots/raw/main/AFTER-INSTRUCT.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/AFTER-CHAT.png) | |:---:|:---:| -|![Image1](https://github.com/oobabooga/screenshots/raw/main/print_default.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/print_parameters.png) | +|![Image1](https://github.com/oobabooga/screenshots/raw/main/AFTER-DEFAULT.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/AFTER-PARAMETERS.png) | ## Features @@ -202,18 +202,19 @@ List of command-line flags ```txt usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS] - [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--chat-buttons] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--auto-devices] - [--gpu-memory GPU_MEMORY [GPU_MEMORY ...]] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] - [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] - [--flash-attn] [--tensorcores] [--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--no_mul_mat_q] [--n_batch N_BATCH] [--no-mmap] [--mlock] - [--n-gpu-layers N_GPU_LAYERS] [--tensor_split TENSOR_SPLIT] [--numa] [--logits_all] [--no_offload_kqv] [--cache-capacity CACHE_CAPACITY] [--row_split] [--streaming-llm] - [--attention-sink-size ATTENTION_SINK_SIZE] [--tokenizer-dir TOKENIZER_DIR] [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] - [--no_xformers] [--no_sdpa] [--cache_8bit] [--cache_4bit] [--num_experts_per_token NUM_EXPERTS_PER_TOKEN] [--triton] [--no_inject_fused_mlp] [--no_use_cuda_fp16] [--desc_act] - [--disable_exllama] [--disable_exllamav2] [--wbits WBITS] [--groupsize GROUPSIZE] [--hqq-backend HQQ_BACKEND] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] + [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--auto-devices] [--gpu-memory GPU_MEMORY [GPU_MEMORY ...]] + [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] + [--use_flash_attention_2] [--use_eager_attention] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--tensorcores] + [--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--no_mul_mat_q] [--n_batch N_BATCH] [--no-mmap] [--mlock] [--n-gpu-layers N_GPU_LAYERS] + [--tensor_split TENSOR_SPLIT] [--numa] [--logits_all] [--no_offload_kqv] [--cache-capacity CACHE_CAPACITY] [--row_split] [--streaming-llm] [--attention-sink-size ATTENTION_SINK_SIZE] + 
[--tokenizer-dir TOKENIZER_DIR] [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] + [--num_experts_per_token NUM_EXPERTS_PER_TOKEN] [--enable_tp] [--triton] [--no_inject_fused_mlp] [--no_use_cuda_fp16] [--desc_act] [--disable_exllama] [--disable_exllamav2] + [--wbits WBITS] [--groupsize GROUPSIZE] [--hqq-backend HQQ_BACKEND] [--cpp-runner] [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] - [--subpath SUBPATH] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--nowebui] + [--subpath SUBPATH] [--old-colors] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--nowebui] [--multimodal-pipeline MULTIMODAL_PIPELINE] [--model_type MODEL_TYPE] [--pre_layer PRE_LAYER [PRE_LAYER ...]] [--checkpoint CHECKPOINT] [--monkey-patch] [--no_inject_fused_attention] + [--cache_4bit] [--cache_8bit] [--chat-buttons] Text generation web UI @@ -232,7 +233,6 @@ Basic settings: file will be loaded by default without the need to use the --settings flag. --extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. --verbose Print the prompts to the terminal. - --chat-buttons Show buttons on the chat tab instead of a hover menu. --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. Model loader: @@ -291,9 +291,8 @@ ExLlamaV2: --no_flash_attn Force flash-attention to not be used. --no_xformers Force xformers to not be used. --no_sdpa Force Torch SDPA to not be used. - --cache_8bit Use 8-bit cache to save VRAM. - --cache_4bit Use Q4 cache to save VRAM. --num_experts_per_token NUM_EXPERTS_PER_TOKEN Number of experts to use for generation. Applies to MoE models like Mixtral. + --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. AutoGPTQ: --triton Use triton. @@ -311,6 +310,9 @@ HQQ: TensorRT-LLM: --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. +Cache: + --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. + DeepSpeed: --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. @@ -332,6 +334,7 @@ Gradio: --ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file. --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy + --old-colors Use the legacy Gradio colors, before the December/2024 update. API: --api Enable the API extension. 
diff --git a/css/Inter/Inter-Italic-VariableFont_opsz,wght.ttf b/css/Inter/Inter-Italic-VariableFont_opsz,wght.ttf new file mode 100644 index 0000000000..43ed4f5ee6 Binary files /dev/null and b/css/Inter/Inter-Italic-VariableFont_opsz,wght.ttf differ diff --git a/css/Inter/Inter-VariableFont_opsz,wght.ttf b/css/Inter/Inter-VariableFont_opsz,wght.ttf new file mode 100644 index 0000000000..e31b51e3e9 Binary files /dev/null and b/css/Inter/Inter-VariableFont_opsz,wght.ttf differ diff --git a/css/chat_style-cai-chat-square.css b/css/chat_style-cai-chat-square.css index d626dbb1c8..854fff607c 100644 --- a/css/chat_style-cai-chat-square.css +++ b/css/chat_style-cai-chat-square.css @@ -16,6 +16,6 @@ } .message { - padding-bottom: 30px; + padding-bottom: 2em; grid-template-columns: 70px minmax(0, 1fr); } diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css index 618184cfab..d7b1ba88e4 100644 --- a/css/chat_style-cai-chat.css +++ b/css/chat_style-cai-chat.css @@ -1,7 +1,7 @@ .message { display: grid; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 15px; + padding-bottom: 2em; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 22.5px !important; diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 50b9402f4d..f6ceb93245 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -1,74 +1,101 @@ .chat { background: transparent; - padding: 24px 19px; - padding-right: 19px !important; + padding: 0; padding-top: 0; } -.chat > .messages { - padding-top: 18px !important; +.chat > .messages:first-child { + padding-top: 0 !important; } -.message { - display: grid; - grid-template-columns: 60px 1fr; - padding-bottom: 25px; - font-size: 15px; - font-family: 'Noto Sans', Helvetica, Arial, sans-serif; - line-height: 24px; +.chat > .messages > :last-child { + margin-bottom: 1.7rem !important; } -.message:first-child { - padding-top: 0; +.chat .message-body p, .chat .message-body li { + font-size: 1rem !important; + line-height: 28px !important; } -.username { - display: none; +.dark .chat .message-body p, +.dark .chat .message-body li, +.dark .chat .message-body q { + color: #d1d5db !important; } -.message-body p, .message-body li { - font-size: 15px !important; - line-height: 24px !important; +.chat .message-body p, +.chat .message-body ul, +.chat .message-body ol { + margin-top: 1.25em !important; + margin-bottom: 1.25em !important; } -.message-body p, .chat .message-body ul, .chat .message-body ol { - margin-bottom: 16px !important; +.chat .message-body p:first-child, +.chat .message-body ul:first-child, +.chat .message-body ol:first-child { + margin-top: 0 !important; } -.message-body p:last-child, .chat .message-body ul:last-child, .chat .message-body ol:last-child { +.chat .message-body p:last-child, +.chat .message-body ul:last-child, +.chat .message-body ol:last-child { margin-bottom: 0 !important; } -.gradio-container .chat .assistant-message { - padding: 20px; +.chat .message-body li { + margin-top: 1.25em !important; + margin-bottom: 1.25em !important; +} + +.user-message, .assistant-message { + font-family: Inter, Helvetica, Arial, sans-serif; +} + +.message:first-child { + padding-top: 0; +} + +.username { + display: none; +} + +.chat .user-message { + padding: 1.5rem 1rem; + border-radius: 0; + border-bottom-right-radius: 0; +} + +.chat .assistant-message { background: #f4f4f4; - margin-top: 9px !important; - margin-bottom: 12px !important; - border-radius: 7px; - border: 1px solid 
var(--border-color-primary); + padding: 1.5rem 1rem; + border-radius: 0; + border: 0; +} + +.dark .chat .user-message { + background: transparent; } .dark .chat .assistant-message { - background: var(--color-grey-800); + background: var(--light-gray); } -.gradio-container .chat .user-message { - padding: 20px; - padding-left: 0; - padding-right: 0; - background-color: transparent; - border-radius: 8px; - border-bottom-right-radius: 0; +.chat .user-message .text, +.chat .assistant-message .text { + max-width: 40.25rem; + margin-left: auto; + margin-right: auto; } -.gradio-container .chat .assistant-message:last-child, .gradio-container .chat .user-message:last-child { - margin-bottom: 0 !important; +/* Create space between two assistant messages in a row */ +.assistant-message + .assistant-message { + margin-top: 1.5rem; } -code { +pre > code { background-color: #f3f4f6 !important; } -.dark code { +.dark pre > code { background-color: #1f2937 !important; } diff --git a/css/main.css b/css/main.css index cf3babdba6..fef3d3f1bf 100644 --- a/css/main.css +++ b/css/main.css @@ -1,7 +1,46 @@ +:root { + --darker-gray: #202123; + --dark-gray: #343541; + --light-gray: #444654; + --light-theme-gray: #f4f4f4; + --border-color-dark: #525252; + --header-width: 112px; + --selected-item-color-dark: #32333e; +} + +@font-face { + font-family: Inter; + src: url('file/css/Inter/Inter-VariableFont_opsz,wght.ttf') format('truetype'); + font-weight: 100 900; + font-style: normal; +} + +@font-face { + font-family: Inter; + src: url('file/css/Inter/Inter-Italic-VariableFont_opsz,wght.ttf') format('truetype'); + font-weight: 100 900; + font-style: italic; +} + .tabs.svelte-710i53 { margin-top: 0 } +.padded.svelte-12cmxck { + padding: 3px 0; +} + +div.svelte-sfqy0y, +div.svelte-iyf88w { + background: transparent; + border: 0; +} + +/* "info" messages without a title above */ +.block > .svelte-e8n7p6:not(:only-of-type, #chat-mode *) { + margin-bottom: 2px; +} + .py-6 { padding-top: 2.5rem } @@ -19,7 +58,7 @@ height: 39.594px; align-self: end; line-height: 1em; - border-radius: 0.5em; + border-radius: 0.375rem; flex: none; } @@ -46,10 +85,6 @@ min-height: 0 } -.dark svg { - fill: white; -} - .dark a { color: white !important; } @@ -62,14 +97,20 @@ ol li p, ul li p { border: 0; } +#default-tab, #notebook-tab, #parameters, #chat-settings, #lora, #training-tab, #model-tab, #session-tab { + padding: 1rem; +} + .gradio-container { max-width: 100% !important; padding-top: 0 !important; } #extensions { - margin-top: 5px; - margin-bottom: 35px; + margin: 5px auto 35px; + max-width: 880px; + padding: 1em; + padding-left: calc(var(--header-width) + 1em); } .extension-tab { @@ -86,20 +127,29 @@ div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { } gradio-app > :first-child { - padding-left: var(--size-4) !important; - padding-right: var(--size-4) !important; + padding: 0 !important; } .header_bar { - background-color: #f4f4f4; box-shadow: 0 0 3px rgba(22 22 22 / 35%); margin-bottom: 0; overflow-x: scroll; - margin-left: calc(-1 * var(--size-4)); - margin-right: calc(-1 * var(--size-4)); - display: block !important; text-wrap: nowrap; z-index: 90; + position: fixed; + display: flex !important; + flex-direction: column; + height: 100dvh; + width: var(--header-width); +} + +.header_bar button { + margin: 0; + padding: 0.75rem; +} + +.header_bar button.selected { + border: 0; } .dark .header_bar { @@ -113,23 +163,23 @@ gradio-app > :first-child { } .textbox_default textarea { - height: calc(100dvh - 271px); + height: 
calc(100dvh - 201px); } .textbox_default_output textarea { - height: calc(100dvh - 185px); + height: calc(100dvh - 117px); } .textbox textarea { - height: calc(100dvh - 241px); + height: calc(100dvh - 172px); } .textbox_logits textarea { - height: calc(100dvh - 236px); + height: calc(100dvh - 205px); } .textbox_logits_notebook textarea { - height: calc(100dvh - 292px); + height: calc(100dvh - 221px); } .monospace textarea { @@ -149,24 +199,6 @@ gradio-app > :first-child { color: #efefef !important; } -@media screen and (width <= 711px) { - .textbox_default textarea { - height: calc(100dvh - 259px); - } - - div .default-token-counter { - top: calc( 0.5 * (100dvh - 236px) ) !important; - } - - .transparent-substring { - display: none; - } - - .hover-menu { - min-width: 250px !important; - } -} - /* Hide the gradio footer */ footer { display: none !important; @@ -227,11 +259,13 @@ button { .pretty_scrollbar::-webkit-scrollbar-thumb, .pretty_scrollbar::-webkit-scrollbar-thumb:hover { background: var(--neutral-300); + border-radius: 30px; } .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { - background: var(--neutral-700); + background: #ccc; + border-radius: 10px; } .pretty_scrollbar::-webkit-resizer { @@ -239,7 +273,8 @@ button { } .dark .pretty_scrollbar::-webkit-resizer { - background: #374151; + background: #ccc; + border-radius: 10px; } .pretty_scrollbar::-webkit-scrollbar-corner { @@ -251,20 +286,26 @@ audio { } /* Copied from https://github.com/AUTOMATIC1111/stable-diffusion-webui */ -.token-counter { +#default-token-counter, #notebook-token-counter { position: absolute !important; - top: calc( 0.5 * (100dvh - 218px) ) !important; - right: 2px; z-index: 100; background: var(--input-background-fill) !important; min-height: 0 !important; + width: 0; + text-align: left; + direction: rtl; + right: 5px; +} + +#default-token-counter { + top: calc(100dvh - 200px) !important; } -.default-token-counter { - top: calc( 0.5 * (100dvh - 248px) ) !important; +#notebook-token-counter { + top: calc(100dvh - 171px) !important; } -.token-counter span { +#default-token-counter span, #notebook-token-counter span { padding: 1px; box-shadow: 0 0 0 0.3em rgb(192 192 192 / 15%), inset 0 0 0.6em rgb(192 192 192 / 7.5%); border: 2px solid rgb(192 192 192 / 40%) !important; @@ -272,15 +313,15 @@ audio { } .no-background { - background: var(--background-fill-primary) !important; + background: transparent; padding: 0 !important; } /* ---------------------------------------------- Chat tab ---------------------------------------------- */ -.h-\[40vh\], .wrap.svelte-byatnx.svelte-byatnx.svelte-byatnx { - height: 66.67vh +.h-\[40dvh\], .wrap.svelte-byatnx.svelte-byatnx.svelte-byatnx { + height: 66.67dvh } .gradio-container { @@ -310,7 +351,13 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-tab { - padding-top: 0; + padding: 0; +} + +#chat-tab > :nth-child(1) { + display: flex; + flex-direction: row; + gap: 0; } #chat-tab button#Generate, #chat-tab button#stop { @@ -322,7 +369,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-tab > :first-child, #extensions { - max-width: 880px; margin-left: auto; margin-right: auto; } @@ -342,61 +388,49 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .chat { margin-left: auto; margin-right: auto; - max-width: 880px; min-height: var(--chat-height); overflow-y: auto; - padding-right: 15px; display: flex; flex-direction: column; word-break: break-word; overflow-wrap: anywhere; border-top: none; - 
border-radius: 0 0 0 8px; + border-radius: 0; visibility: visible; } .chat-parent { - height: calc(100dvh - 98px - var(--header-height) - var(--input-delta)); + height: calc(100dvh - 98px - var(--input-delta)); overflow: auto !important; border-radius: 0 !important; margin-bottom: var(--input-delta) !important; } -/* On desktop, automatically hide the chat scroll bar - * when not hovered. */ -@media (hover: hover) and (pointer: fine) { - .chat-parent { - visibility: hidden; - } - - .chat-parent:focus, .chat-parent:hover { - visibility: visible; - } -} - .chat-parent .prose { visibility: visible; } -.old-ui .chat-parent { - height: calc(100dvh - 192px - var(--header-height) - var(--input-delta)); - margin-bottom: var(--input-delta) !important; +.chat .message { + width: min(100%, 48rem); + margin-left: auto; + margin-right: auto; + text-align: start; + padding-left: 1rem; + padding-right: 1rem; } .chat-parent.bigchat { - height: calc(100dvh - 98px - var(--header-height) - var(--input-delta)) !important; + height: calc(100dvh - 98px - var(--input-delta)) !important; margin-bottom: var(--input-delta) !important; } .chat > .messages { display: flex; flex-direction: column; - padding-top: 25px; } -.chat .message:last-child { - margin-bottom: 0 !important; - padding-bottom: 15px !important; +.chat > .messages > :first-child { + padding-top: 20px; } .message-body h1, @@ -404,7 +438,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .message-body h3, .message-body h4 { color: var(--body-text-color); - margin: 20px 0 10px 0; + margin: 20px 0 10px; } .dark .message q { @@ -423,12 +457,12 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding-inline-start: 2em; } -.message-body li:not(:last-child) { - margin-top: 0 !important; - margin-bottom: 2px !important; +.chat .message-body li:not(:last-child) { + margin-top: 0; + margin-bottom: 2px; } -.message-body li:last-child { +.chat .message-body li:last-child { margin-bottom: 0 !important; } @@ -456,7 +490,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { overflow: scroll; } -.message-body code { +.prose ul ul { + margin: 0; +} + +.message-body pre > code { white-space: pre-wrap !important; word-wrap: break-word !important; border: 1px solid #666; @@ -467,7 +505,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { color: #1f2328; } -.dark .message-body code { +.dark .message-body pre > code { background: #0d1117 !important; color: rgb(201 209 217); } @@ -477,8 +515,18 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding: 15px; } +.message-body :not(pre) > code::before { + content: "`"; +} + +.message-body :not(pre) > code::after { + content: "`"; +} + .message-body :not(pre) > code { white-space: normal !important; + font-weight: bold; + font-family: unset; } #chat-input { @@ -488,6 +536,15 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { border: none; } +#chat-input textarea { + padding: 0.65rem 2.5rem; +} + +#chat-input textarea::placeholder { + white-space: nowrap; + overflow: hidden; +} + #chat-input textarea:focus { box-shadow: none !important; } @@ -500,6 +557,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { display: none; } +.chat-input-positioned { + position: absolute; + bottom: 0; + max-width: 54rem; + left: 50%; + transform: translateX(-50%); +} + @media print { body { visibility: hidden; @@ -535,7 +600,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #show-controls { position: absolute; - height: 100%; background-color: transparent; border: 0 !important; border-radius: 0; @@ -544,7 +608,8 @@ 
div.svelte-362y77>*, div.svelte-362y77>.form>* { #show-controls label { z-index: 1000; position: absolute; - right: 0; + right: 30px; + top: 10px; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; @@ -626,7 +691,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: absolute; bottom: 80%; left: 0; - background-color: var(--background-fill-primary); box-shadow: 0 0 5px rgb(0 0 0 / 25%); z-index: 10000; min-width: 330px; @@ -637,7 +701,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { width: 100%; background: transparent !important; border-radius: 0 !important; - border-color: var(--border-color-primary); justify-content: space-between; margin: 0 !important; height: 36px; @@ -659,7 +722,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { opacity: 0.333; } -#chat-tab:not(.old-ui) #chat-buttons { +#chat-tab #chat-buttons { display: none !important; } @@ -690,23 +753,37 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-input-row { - padding-bottom: 20px; + padding-bottom: 1.5em; + padding-left: 1rem; + padding-right: 1rem; } -.old-ui #chat-input-row, #chat-input-row.bigchat { - padding-bottom: 0 !important; +#chat-input-row.bigchat { + padding-bottom: 1px !important; } #chat-col { padding-bottom: 100px; } -.old-ui #chat-col, #chat-col.bigchat { - padding-bottom: 80px !important; +@media screen and (width <= 924px) { + #chat-col { + padding-bottom: 100px; + margin-top: 32px; + position: relative; /* Ensure positioning for the pseudo-element */ + } + + .chat-parent { + height: calc(100dvh - 98px - var(--input-delta) - 32px); + } + + .chat-parent.bigchat { + height: calc(100dvh - 98px - var(--input-delta) - 32px) !important; + } } -.old-ui #chat-buttons #clear-history-confirm { - order: -1; +#chat-col.bigchat { + padding-bottom: 80px !important; } .chat ol, .chat ul { @@ -721,26 +798,37 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } /* ---------------------------------------------- - Past chat histories in a side bar on desktop + Create the sidebars ---------------------------------------------- */ -@media screen and (width >= 1327px) { - #past-chats-row { - position: absolute; - top: 36px; - left: 0; - width: calc(0.5*(var(--document-width) - 880px - 120px - 16px*2)); - max-width: 300px; - margin-left: calc(-0.5*(var(--document-width) - 880px - 14px - 16px * 2)); - } +#chat-controls, +#past-chats-row { + width: 260px; + max-width: 80vw; + padding: 0.5rem; + height: 100dvh; + flex-shrink: 0; + box-sizing: content-box; + z-index: 10; +} - #chat-controls { - position: absolute; - top: 16px; - right: 0; - width: calc(0.5*(var(--document-width) - 880px - 120px - 16px*2)); - max-width: 400px; - margin-right: calc(-0.5*(var(--document-width) - 880px - 14px - 16px * 2)); - } +#past-chats-row:not(.negative-header) { + max-width: calc(85vw - var(--header-width)); +} + +#chat-controls { + padding: 1rem; + padding-bottom: 0; + overflow-y: scroll; +} + +#chat-controls > :nth-child(1) { + padding: 0.5rem; +} + +#past-chats-row + * { + width: unset; + flex-grow: 1; + flex-shrink: 1; } /* ---------------------------------------------- @@ -748,6 +836,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { ---------------------------------------------- */ .options { z-index: 100 !important; + border: 1px solid var(--input-border-color); + border-radius: 0; } /* ---------------------------------------------- @@ -757,12 +847,12 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: fixed; bottom: 0; left: 0; - width: calc((100vw - 880px - 120px) /2); + width: 
calc(100vw / 2 - 600px); + z-index: 10000; } .pfp_character { position: relative; - z-index: 100; } .pfp_character:hover { @@ -776,10 +866,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #past-chats { - max-height: calc(100vh - 195px); + max-height: calc(100dvh - 90px); overflow-y: scroll !important; border-radius: 0; - scrollbar-width: none; /* Hide scrollbar in Firefox by default */ + scrollbar-width: auto; +} + +#past-chats::-webkit-scrollbar { + display: block; } #past-chats label { @@ -790,6 +884,24 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { border-radius: 0; padding-top: 8px; padding-bottom: 8px; + position: relative; + min-height: 42px !important; +} + +#past-chats label::before { + content: url('data:image/svg+xml;utf8,'); + position: absolute; + top: 12px; + left: 12px; + margin-right: 8px; +} + +.dark #past-chats label::before { + content: url('data:image/svg+xml;utf8,'); +} + +#past-chats label span { + margin-left: 29px; } #past-chats > :nth-child(2) { @@ -797,23 +909,260 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #past-chats > :nth-child(3) { - gap: 0; + gap: 0.25rem; } -#past-chats::-webkit-scrollbar { +#past-chats input { display: none; } -#past-chats:hover { - scrollbar-width: auto; +#past-chats label { + padding: 0.75rem; + font-size: 12.5px; + font-weight: 400; } -#past-chats:hover::-webkit-scrollbar { - display: block; +#past-chats .selected, +#past-chats label:hover { + border-radius: 0.5rem; +} + +#past-chats label:hover { + cursor: pointer; +} + +#past-chats-buttons, +#delete-chat-row, +#rename-row { + width: 100%; + justify-content: center; +} + + +#past-chats-row, +#chat-controls { + width: 260px; + padding: 0.5rem; + height: calc(100dvh - 16px); + flex-shrink: 0; + box-sizing: content-box; } -@media screen and (width < 1327px) { - #past-chats { - max-height: 300px; +.sidebar-hidden { + width: 0 !important; + padding: 0 !important; + overflow: hidden; +} + +#past-chats-toggle, +#chat-controls-toggle, +#navigation-toggle { + display: flex; + align-items: center; + justify-content: center; + cursor: pointer; + user-select: none; + border-radius: 3px; + z-index: 1000; + position: fixed; + width: 2rem; + height: 2rem; + top: 0; +} + +#past-chats-toggle svg, +#chat-controls-toggle svg, +#navigation-toggle svg { + pointer-events: none; +} + +@media screen and (width <= 408px) { + #past-chats-toggle.past-chats-open { + top: 28px; + } + + #chat-controls-toggle.chat-controls-open { + top: 28px; + right: calc(16px + min(260px, 80vw)) !important; + } +} + +#past-chats-toggle.past-chats-open.negative-header { + left: calc(min(260px, 85vw) + 16px); +} + +#past-chats-toggle.past-chats-open:not(.negative-header) { + left: calc(112px + min(260px, calc(85vw - var(--header-width))) + 16px); +} + +#past-chats-toggle.past-chats-closed:not(.negative-header) { + left: 112px; +} + +#past-chats-toggle.past-chats-closed.negative-header { + left: 0; + top: 28px; +} + +@media screen and (width <= 924px) { + #past-chats-toggle.past-chats-closed.negative-header { + left: 28px; + top: 0; + } +} + +.header_bar ~ * { + margin-left: var(--header-width); +} + +/* Positions for chat-controls-toggle */ +#chat-controls-toggle.chat-controls-open { + right: calc(min(260px, 80vw) + 23px); +} + +#chat-controls-toggle.chat-controls-closed { + right: 7px; +} + +@media screen and (width <= 924px) { + #chat-controls.sidebar-shown { + position: fixed; + right: 0; + } + + #past-chats-row.sidebar-shown { + position: fixed; + } +} + +/* ---------------------------------------------- + 
Dark theme +---------------------------------------------- */ +.dark .header_bar { + background-color: var(--darker-gray) !important; +} + +.dark .header_bar button.selected { + background: var(--selected-item-color-dark); +} + +.dark #chat-input textarea { + background: var(--light-gray); + color: white !important; + border-color: #292c3b; +} + +.dark #chat-input textarea::placeholder { + color: #9ca3af; +} + +.dark .hover-menu { + background-color: var(--darker-gray); +} + +.dark .hover-menu button { + border-color: var(--border-color-primary); +} + +.dark #chat-controls, +.dark #past-chats-row { + background-color: var(--darker-gray); + border: 0 !important; +} + +.dark #past-chats .selected, +.dark #past-chats label:hover { + background-color: var(--selected-item-color-dark) !important; +} + +.dark #past-chats-row, +.dark #chat-controls { + background-color: var(--darker-gray); +} + +.dark #past-chats-toggle, +.dark #chat-controls-toggle, +.dark #navigation-toggle { + color: white; +} + +.dark svg { + fill: white; + color: white; +} + +@media screen and (width <= 408px) { + .dark #past-chats-toggle.past-chats-open { + background: var(--darker-gray); + } + + .dark #chat-controls-toggle.chat-controls-open { + background: var(--darker-gray); + } +} + +/* ---------------------------------------------- + Light theme +---------------------------------------------- */ +.header_bar { + background-color: var(--light-theme-gray) !important; +} + +.header_bar button.selected { + background: white; +} + +#chat-controls, +#past-chats-row { + background-color: var(--light-theme-gray); +} + +#chat-controls { + border-left: 1px solid #d9d9d0; +} + +#past-chats-row { + border-right: 1px solid #d9d9d0; +} + +#past-chats-toggle, +#chat-controls-toggle, +#navigation-toggle { + color: gray !important; +} + +.mobile-top-bar { + position: fixed; + top: 0; + left: 0; + width: 100%; + height: 32px; + z-index: 2; + opacity: 0; + pointer-events: none; +} + +@media screen and (width <= 924px) { + .mobile-top-bar { + opacity: 1; + pointer-events: auto; + } + + .dark .mobile-top-bar { + background-color: var(--darker-gray); + } + + .mobile-top-bar { + background-color: var(--light-theme-gray); + } +} + +@media screen and (width <= 408px) { + #past-chats-toggle.past-chats-open { + background: var(--light-theme-gray); + } + + #chat-controls-toggle.chat-controls-open { + background: var(--light-theme-gray); } } diff --git a/extensions/coqui_tts/requirements.txt b/extensions/coqui_tts/requirements.txt index 747f99a068..b0b691e8fe 100644 --- a/extensions/coqui_tts/requirements.txt +++ b/extensions/coqui_tts/requirements.txt @@ -1 +1 @@ -TTS==0.21.* \ No newline at end of file +coqui-tts==0.25.1 diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 6bd8f409a0..2cefc22bde 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -143,21 +143,20 @@ def convert_history(history): new_history = [] for entry in history: if isinstance(entry['content'], list): - image_url = None - content = None for item in entry['content']: if not isinstance(item, dict): continue - + + image_url = None + content = None if item['type'] == 'image_url' and isinstance(item['image_url'], dict): image_url = item['image_url']['url'] elif item['type'] == 'text' and isinstance(item['text'], str): content = item['text'] - - if image_url: - new_history.append({"image_url": image_url, "role": "user"}) - if content: - new_history.append({"content": content, "role": "user"}) + if image_url: + 
new_history.append({"image_url": image_url, "role": "user"}) + if content: + new_history.append({"content": content, "role": "user"}) else: new_history.append(entry) diff --git a/js/main.js b/js/main.js index 3028afac1f..a8018175db 100644 --- a/js/main.js +++ b/js/main.js @@ -18,16 +18,18 @@ document.querySelector(".header_bar").addEventListener("click", function(event) if (extensionsVisible) { if (extensions) { extensions.style.display = "flex"; - extensions.style.maxWidth = chatVisible ? "880px" : "none"; - extensions.style.padding = chatVisible ? "0px" : "15px"; } + this.style.marginBottom = chatVisible ? "0px" : "19px"; if (chatVisible && !showControlsChecked) { - document.querySelectorAll("#chat-tab > div > :nth-child(n+2), #extensions").forEach(element => { + document.querySelectorAll( + "#chat-tab > div > :nth-child(1), #chat-tab > div > :nth-child(3), #chat-tab > div > :nth-child(4), #extensions" + ).forEach(element => { element.style.display = "none"; }); } + } else { this.style.marginBottom = "19px"; if (extensions) extensions.style.display = "none"; @@ -132,8 +134,7 @@ targetElement.addEventListener("scroll", function() { const observer = new MutationObserver(function(mutations) { updateCssProperties(); - const firstChild = targetElement.children[0]; - if (firstChild.classList.contains("generating")) { + if (targetElement.classList.contains("_generating")) { typing.parentNode.classList.add("visible-dots"); document.getElementById("stop").style.display = "flex"; document.getElementById("Generate").style.display = "none"; @@ -255,7 +256,7 @@ for (i = 0; i < slimDropdownElements.length; i++) { // The show/hide events were adapted from: // https://github.com/SillyTavern/SillyTavern/blob/6c8bd06308c69d51e2eb174541792a870a83d2d6/public/script.js //------------------------------------------------ -var buttonsInChat = document.querySelectorAll("#chat-tab:not(.old-ui) #chat-buttons button"); +var buttonsInChat = document.querySelectorAll("#chat-tab #chat-buttons button"); var button = document.getElementById("hover-element-button"); var menu = document.getElementById("hover-menu"); var istouchscreen = (navigator.maxTouchPoints > 0) || "ontouchstart" in document.documentElement; @@ -290,12 +291,6 @@ if (buttonsInChat.length > 0) { thisButton.innerHTML = newText; } } -} else { - buttonsInChat = document.querySelectorAll("#chat-tab.old-ui #chat-buttons button"); - for (let i = 0; i < buttonsInChat.length; i++) { - buttonsInChat[i].textContent = buttonsInChat[i].textContent.replace(/ \(.*?\)/, ""); - } - document.getElementById("gr-hover-container").style.display = "none"; } function isMouseOverButtonOrMenu() { @@ -339,6 +334,8 @@ menu.addEventListener("mouseleave", function () { // Add event listener for click anywhere in the document document.addEventListener("click", function (event) { + const target = event.target; + // Check if the click is outside the button/menu and the menu is visible if (!isMouseOverButtonOrMenu() && menu.style.display === "flex") { hideMenu(); @@ -347,6 +344,21 @@ document.addEventListener("click", function (event) { if (event.target.classList.contains("pfp_character")) { toggleBigPicture(); } + + // Handle sidebar clicks on mobile + if (isMobile()) { + // Check if the click did NOT originate from any of the specified toggle buttons or elements + if ( + target.closest("#navigation-toggle") !== navigationToggle && + target.closest("#past-chats-toggle") !== pastChatsToggle && + target.closest("#chat-controls-toggle") !== chatControlsToggle && + 
target.closest(".header_bar") !== headerBar && + target.closest("#past-chats-row") !== pastChatsRow && + target.closest("#chat-controls") !== chatControlsRow + ) { + handleIndividualSidebarClose(event); + } + } }); //------------------------------------------------ @@ -361,10 +373,9 @@ for (var i = 0; i < 2; i++) { parent.insertBefore(elementToMove, parent.firstChild); //------------------------------------------------ -// Make the chat input grow upwards instead of downwards +// Position the chat input //------------------------------------------------ -document.getElementById("show-controls").parentNode.style.position = "absolute"; -document.getElementById("show-controls").parentNode.style.bottom = "0px"; +document.getElementById("show-controls").parentNode.classList.add("chat-input-positioned"); //------------------------------------------------ // Focus on the chat input @@ -444,20 +455,10 @@ function updateCssProperties() { // Check if the chat container is visible if (chatContainer.clientHeight > 0) { - var numericHeight = chatContainer.parentNode.clientHeight - chatInputHeight + 40 - 100; - if (document.getElementById("chat-tab").style.paddingBottom != "") { - numericHeight += 20; - } - - const newChatHeight = `${numericHeight}px`; + const newChatHeight = `${chatContainer.parentNode.clientHeight - chatInputHeight + 40 - 100 - 20}px`; document.documentElement.style.setProperty("--chat-height", newChatHeight); document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`); - // Get and set header height - const header = document.querySelector(".header_bar"); - const headerHeight = `${header.clientHeight}px`; - document.documentElement.style.setProperty("--header-height", headerHeight); - // Adjust scrollTop based on input height change if (chatInputHeight !== currentChatInputHeight) { if (!isScrolled && chatInputHeight < currentChatInputHeight) { @@ -477,18 +478,6 @@ new ResizeObserver(updateCssProperties).observe(document.querySelector("#chat-in // Handle changes in window size window.addEventListener("resize", updateCssProperties); -//------------------------------------------------ -// Keep track of the display width to position the past -// chats dropdown on desktop -//------------------------------------------------ -function updateDocumentWidth() { - var updatedWidth = window.innerWidth || document.documentElement.clientWidth || document.body.clientWidth; - document.documentElement.style.setProperty("--document-width", updatedWidth + "px"); -} - -updateDocumentWidth(); -window.addEventListener("resize", updateDocumentWidth); - //------------------------------------------------ // Focus on the rename text area when it becomes visible //------------------------------------------------ @@ -568,6 +557,8 @@ function moveToChatTab() { grandParent.style.display = "none"; } + grandParent.children[0].style.minWidth = "100%"; + const chatControlsFirstChild = document.querySelector("#chat-controls").firstElementChild; const newParent = chatControlsFirstChild; let newPosition = newParent.children.length - 2; @@ -586,6 +577,7 @@ function restoreOriginalPosition() { document.getElementById("save-character").style.display = ""; movedElement.style.display = ""; + movedElement.children[0].style.minWidth = ""; } } @@ -612,3 +604,222 @@ window.addEventListener("beforeunload", function (event) { }); moveToChatTab(); + +//------------------------------------------------ +// Buttons to toggle the sidebars +//------------------------------------------------ + +const 
leftArrowSVG = ` + + + + + +`; + +const rightArrowSVG = ` + + + + + +`; + +const hamburgerMenuSVG = ` + + + + +`; + +const closeMenuSVG = ` + + + +`; + +const chatTab = document.getElementById("chat-tab"); +const pastChatsRow = document.getElementById("past-chats-row"); +const chatControlsRow = document.getElementById("chat-controls"); + +if (chatTab) { + // Create past-chats-toggle div + const pastChatsToggle = document.createElement("div"); + pastChatsToggle.id = "past-chats-toggle"; + pastChatsToggle.innerHTML = leftArrowSVG; // Set initial icon to left arrow + pastChatsToggle.classList.add("past-chats-open"); // Set initial position + + // Create chat-controls-toggle div + const chatControlsToggle = document.createElement("div"); + chatControlsToggle.id = "chat-controls-toggle"; + chatControlsToggle.innerHTML = rightArrowSVG; // Set initial icon to right arrow + chatControlsToggle.classList.add("chat-controls-open"); // Set initial position + + // Append both elements to the chat-tab + chatTab.appendChild(pastChatsToggle); + chatTab.appendChild(chatControlsToggle); +} + +// Create navigation toggle div +const navigationToggle = document.createElement("div"); +navigationToggle.id = "navigation-toggle"; +navigationToggle.innerHTML = leftArrowSVG; // Set initial icon to right arrow +navigationToggle.classList.add("navigation-left"); // Set initial position +headerBar.appendChild(navigationToggle); + +// Retrieve the dynamically created toggle buttons +const pastChatsToggle = document.getElementById("past-chats-toggle"); +const chatControlsToggle = document.getElementById("chat-controls-toggle"); + +function handleIndividualSidebarClose(event) { + const target = event.target; + + // Close navigation bar if click is outside and it is open + if (!headerBar.contains(target) && !headerBar.classList.contains("sidebar-hidden")) { + toggleSidebar(headerBar, navigationToggle, true); + } + + // Close past chats row if click is outside and it is open + if (!pastChatsRow.contains(target) && !pastChatsRow.classList.contains("sidebar-hidden")) { + toggleSidebar(pastChatsRow, pastChatsToggle, true); + } + + // Close chat controls row if click is outside and it is open + if (!chatControlsRow.contains(target) && !chatControlsRow.classList.contains("sidebar-hidden")) { + toggleSidebar(chatControlsRow, chatControlsToggle, true); + } +} + +function toggleSidebar(sidebar, toggle, forceClose = false) { + const isCurrentlyHidden = sidebar.classList.contains("sidebar-hidden"); + const shouldClose = !isCurrentlyHidden; + + // Apply visibility classes + sidebar.classList.toggle("sidebar-hidden", shouldClose); + sidebar.classList.toggle("sidebar-shown", !shouldClose); + + if (sidebar === headerBar) { + // Special handling for header bar + document.documentElement.style.setProperty("--header-width", shouldClose ? "0px" : "112px"); + pastChatsRow.classList.toggle("negative-header", shouldClose); + pastChatsToggle.classList.toggle("negative-header", shouldClose); + toggle.innerHTML = shouldClose ? hamburgerMenuSVG : closeMenuSVG; + } else if (sidebar === pastChatsRow) { + // Past chats sidebar + toggle.classList.toggle("past-chats-closed", shouldClose); + toggle.classList.toggle("past-chats-open", !shouldClose); + toggle.innerHTML = shouldClose ? rightArrowSVG : leftArrowSVG; + } else if (sidebar === chatControlsRow) { + // Chat controls sidebar + toggle.classList.toggle("chat-controls-closed", shouldClose); + toggle.classList.toggle("chat-controls-open", !shouldClose); + toggle.innerHTML = shouldClose ? 
leftArrowSVG : rightArrowSVG; + } + + // Mobile handling + if (isMobile()) { + sidebar.classList.toggle("sidebar-shown", !shouldClose); + } +} + +// Function to check if the device is mobile +function isMobile() { + return window.innerWidth <= 924; +} + +// Function to initialize sidebars +function initializeSidebars() { + const isOnMobile = isMobile(); + + if (isOnMobile) { + // Mobile state: Hide sidebars and set closed states + [pastChatsRow, chatControlsRow, headerBar].forEach(el => { + el.classList.add("sidebar-hidden"); + el.classList.remove("sidebar-shown"); + }); + + document.documentElement.style.setProperty("--header-width", "0px"); + pastChatsRow.classList.add("negative-header"); + pastChatsToggle.classList.add("negative-header", "past-chats-closed"); + pastChatsToggle.classList.remove("past-chats-open"); + + [chatControlsToggle, navigationToggle].forEach(el => { + el.classList.add("chat-controls-closed"); + el.classList.remove("chat-controls-open"); + }); + + pastChatsToggle.innerHTML = rightArrowSVG; + chatControlsToggle.innerHTML = leftArrowSVG; + navigationToggle.innerHTML = hamburgerMenuSVG; + } else { + // Desktop state: Show sidebars and set open states + [pastChatsRow, chatControlsRow].forEach(el => { + el.classList.remove("sidebar-hidden", "sidebar-shown"); + }); + + pastChatsToggle.classList.add("past-chats-open"); + pastChatsToggle.classList.remove("past-chats-closed"); + + [chatControlsToggle, navigationToggle].forEach(el => { + el.classList.add("chat-controls-open"); + el.classList.remove("chat-controls-closed"); + }); + + pastChatsToggle.innerHTML = leftArrowSVG; + chatControlsToggle.innerHTML = rightArrowSVG; + navigationToggle.innerHTML = closeMenuSVG; + } +} + +// Run the initializer when the page loads +initializeSidebars(); + +// Add click event listeners to toggle buttons +pastChatsToggle.addEventListener("click", () => { + toggleSidebar(pastChatsRow, pastChatsToggle); +}); + +chatControlsToggle.addEventListener("click", () => { + toggleSidebar(chatControlsRow, chatControlsToggle); +}); + +navigationToggle.addEventListener("click", () => { + toggleSidebar(headerBar, navigationToggle); +}); + +//------------------------------------------------ +// Fixes #chat-input textarea height issue +// for devices with width <= 924px +//------------------------------------------------ + +if (isMobile()) { + // Target the textarea + const textarea = document.querySelector("#chat-input textarea"); + + if (textarea) { + // Simulate adding and removing a newline + textarea.value += "\n"; + textarea.dispatchEvent(new Event("input", { bubbles: true })); + textarea.value = textarea.value.slice(0, -1); + textarea.dispatchEvent(new Event("input", { bubbles: true })); + } +} + +//------------------------------------------------ +// Create a top navigation bar on mobile +//------------------------------------------------ + +function createMobileTopBar() { + const chatTab = document.getElementById("chat-tab"); + + // Only create the top bar if it doesn't already exist + if (chatTab && !chatTab.querySelector(".mobile-top-bar")) { + const topBar = document.createElement("div"); + topBar.classList.add("mobile-top-bar"); + + // Insert the top bar as the first child of chat-tab + chatTab.appendChild(topBar); + } +} + +createMobileTopBar(); diff --git a/js/show_controls.js b/js/show_controls.js index 1ff88e52aa..1a87b52d96 100644 --- a/js/show_controls.js +++ b/js/show_controls.js @@ -1,4 +1,6 @@ -const belowChatInput = document.querySelectorAll("#chat-tab > div > :nth-child(n+2), 
#extensions"); +const belowChatInput = document.querySelectorAll( + "#chat-tab > div > :nth-child(1), #chat-tab > div > :nth-child(3), #chat-tab > div > :nth-child(4), #extensions" +); const chatParent = document.querySelector(".chat-parent"); function toggle_controls(value) { diff --git a/modules/block_requests.py b/modules/block_requests.py index 886930f0c0..6adc385a75 100644 --- a/modules/block_requests.py +++ b/modules/block_requests.py @@ -47,7 +47,7 @@ def my_open(*args, **kwargs): if len(args) > 1 and args[1] == 'rb': file_contents = file_contents.decode('utf-8') - file_contents = file_contents.replace('\t\t', '') + file_contents = file_contents.replace('\t\t', '') file_contents = file_contents.replace('cdnjs.cloudflare.com', '127.0.0.1') file_contents = file_contents.replace( '', diff --git a/modules/chat.py b/modules/chat.py index b81cfea6ee..92808fb7de 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -352,13 +352,17 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess for j, reply in enumerate(generate_reply(prompt, state, stopping_strings=stopping_strings, is_chat=True, for_ui=for_ui)): # Extract the reply - visible_reply = reply if state['mode'] in ['chat', 'chat-instruct']: - visible_reply = re.sub("(||{{user}})", state['name1'], reply) + visible_reply = re.sub("(||{{user}})", state['name1'], reply + '❚') + else: + visible_reply = reply + '❚' visible_reply = html.escape(visible_reply) if shared.stop_everything: + if output['visible'][-1][1].endswith('❚'): + output['visible'][-1][1] = output['visible'][-1][1][:-1] + output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output return @@ -374,6 +378,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if is_stream: yield output + if output['visible'][-1][1].endswith('❚'): + output['visible'][-1][1] = output['visible'][-1][1][:-1] + output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output @@ -606,9 +613,9 @@ def find_all_histories_with_first_prompts(state): first_prompt = first_prompt.strip() - # Truncate the first prompt if it's longer than 32 characters - if len(first_prompt) > 32: - first_prompt = first_prompt[:29] + '...' + # Truncate the first prompt if it's longer than 30 characters + if len(first_prompt) > 30: + first_prompt = first_prompt[:30-3] + '...' 
result.append((first_prompt, filename)) @@ -1087,9 +1094,8 @@ def handle_delete_chat_confirm_click(state): def handle_rename_chat_click(): return [ - gr.update(visible=True, value="My New Chat"), + gr.update(value="My New Chat"), gr.update(visible=True), - gr.update(visible=True) ] @@ -1100,16 +1106,14 @@ def handle_rename_chat_confirm(rename_to, state): return [ gr.update(choices=histories, value=rename_to), gr.update(visible=False), - gr.update(visible=False), - gr.update(visible=False) ] def handle_upload_chat_history(load_chat_history, state): history = start_new_chat(state) history = load_history_json(load_chat_history, history) - histories = find_all_histories_with_first_prompts(state) save_history(history, state['unique_id'], state['character_menu'], state['mode']) + histories = find_all_histories_with_first_prompts(state) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) @@ -1209,7 +1213,7 @@ def handle_delete_template_click(template): return [ f"{template}.yaml", "instruction-templates/", - gr.update(visible=True) + gr.update(visible=False) ] diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 0498c4882e..9b6da83c87 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -2,17 +2,19 @@ from pathlib import Path import torch + from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Cache_8bit, ExLlamaV2Cache_Q4, + ExLlamaV2Cache_Q6, + ExLlamaV2Cache_Q8, ExLlamaV2Cache_TP, ExLlamaV2Config, ExLlamaV2Tokenizer ) from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator - from modules import shared from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length @@ -57,12 +59,20 @@ def from_pretrained(self, path_to_model): model.load(split) # Determine the correct cache type - if shared.args.cache_8bit: + kv_cache_type = shared.args.cache_type.lower() + + if kv_cache_type == 'fp16': + cache_type = ExLlamaV2Cache + elif kv_cache_type == 'fp8': cache_type = ExLlamaV2Cache_8bit - elif shared.args.cache_4bit: + elif kv_cache_type == 'q8': + cache_type = ExLlamaV2Cache_Q8 + elif kv_cache_type == 'q6': + cache_type = ExLlamaV2Cache_Q6 + elif kv_cache_type == 'q4': cache_type = ExLlamaV2Cache_Q4 else: - cache_type = ExLlamaV2Cache + raise ValueError(f"Invalid cache type for ExLlamaV2: {cache_type}. 
Valid options are: fp16, fp8, q8, q6, q4.") # Use TP if specified if shared.args.enable_tp: diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index 320a8d2467..62d1e0547c 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -4,18 +4,20 @@ from typing import Any, Dict, Optional, Union import torch +from torch.nn import CrossEntropyLoss +from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel +from transformers.modeling_outputs import CausalLMOutputWithPast + from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Cache_8bit, ExLlamaV2Cache_Q4, + ExLlamaV2Cache_Q6, + ExLlamaV2Cache_Q8, ExLlamaV2Cache_TP, ExLlamaV2Config ) -from torch.nn import CrossEntropyLoss -from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel -from transformers.modeling_outputs import CausalLMOutputWithPast - from modules import shared from modules.logging_colors import logger @@ -45,12 +47,20 @@ def __init__(self, config: ExLlamaV2Config): self.ex_model.load(split) # Determine the correct cache type - if shared.args.cache_8bit: + kv_cache_type = shared.args.cache_type.lower() + + if kv_cache_type == 'fp16': + cache_type = ExLlamaV2Cache + elif kv_cache_type == 'fp8': cache_type = ExLlamaV2Cache_8bit - elif shared.args.cache_4bit: + elif kv_cache_type == 'q8': + cache_type = ExLlamaV2Cache_Q8 + elif kv_cache_type == 'q6': + cache_type = ExLlamaV2Cache_Q6 + elif kv_cache_type == 'q4': cache_type = ExLlamaV2Cache_Q4 else: - cache_type = ExLlamaV2Cache + raise ValueError(f"Invalid cache type for ExLlamaV2: {cache_type}. Valid options are: fp16, fp8, q8, q6, q4.") # Use TP if specified if shared.args.enable_tp: diff --git a/modules/html_generator.py b/modules/html_generator.py index d0afd6b213..01b2086610 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -104,6 +104,8 @@ def convert_to_markdown(string): result = '' is_code = False is_latex = False + previous_line_empty = True + for line in string.split('\n'): stripped_line = line.strip() @@ -120,13 +122,20 @@ def convert_to_markdown(string): elif stripped_line.endswith('\\\\]'): is_latex = False - result += line - - # Don't add an extra \n for tables, code, or LaTeX - if is_code or is_latex or line.startswith('|'): - result += '\n' + # Preserve indentation for lists and code blocks + if stripped_line.startswith('-') or stripped_line.startswith('*') or stripped_line.startswith('+') or stripped_line.startswith('>') or re.match(r'\d+\.', stripped_line): + result += line + '\n' + previous_line_empty = False + elif is_code or is_latex or line.startswith('|'): + result += line + '\n' + previous_line_empty = False else: - result += '\n\n' + if previous_line_empty: + result += line.strip() + '\n' + else: + result += line.strip() + '\n\n' + + previous_line_empty = stripped_line == '' result = result.strip() if is_code: @@ -145,14 +154,15 @@ def convert_to_markdown(string): result = re.sub(list_item_pattern, r'\g<1> ' + delete_str, result) # Convert to HTML using markdown - html_output = markdown.markdown(result, extensions=['fenced_code', 'tables']) + html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'], tab_length=2) # Remove the delete string from the HTML output pos = html_output.rfind(delete_str) if pos > -1: html_output = html_output[:pos] + html_output[pos + len(delete_str):] else: - html_output = markdown.markdown(result, extensions=['fenced_code', 'tables']) + # Convert to HTML using markdown + html_output = markdown.markdown(result, 
extensions=['fenced_code', 'tables'], tab_length=2) # Unescape code blocks pattern = re.compile(r']*>(.*?)', re.DOTALL) diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index 6611a7c1a8..f9964fe8b0 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -9,6 +9,7 @@ from modules import shared from modules.llama_cpp_python_hijack import llama_cpp_lib +from modules.llamacpp_model import get_llamacpp_cache_type_for_string from modules.logging_colors import logger @@ -196,12 +197,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P 'flash_attn': shared.args.flash_attn } - if shared.args.cache_4bit: - params["type_k"] = 2 - params["type_v"] = 2 - elif shared.args.cache_8bit: - params["type_k"] = 8 - params["type_v"] = 8 + if shared.args.cache_type != 'fp16': + params["type_k"] = get_llamacpp_cache_type_for_string(shared.args.cache_type) + params["type_v"] = get_llamacpp_cache_type_for_string(shared.args.cache_type) Llama = llama_cpp_lib().Llama model = Llama(**params) diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 96f7ed56b5..6a76ee4e95 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -10,6 +10,35 @@ from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length +llamacpp_quant_mapping = { + 'f32': 0, + 'fp16': 1, + 'q4_0': 2, + 'q4_1': 3, + 'q5_0': 6, + 'q5_1': 7, + 'q8_0': 8, + 'q8_1': 9, + 'q2_k': 10, + 'q3_k': 11, + 'q4_k': 12, + 'q5_k': 13, + 'q6_k': 14, + 'q8_k': 15, + 'iq4_nl': 20, + 'bf16': 30, +} + +llamacpp_valid_cache_types = {'fp16', 'q8_0', 'q4_0'} + + +def get_llamacpp_cache_type_for_string(quant_type: str): + quant_type = quant_type.lower() + if quant_type in llamacpp_valid_cache_types: + return llamacpp_quant_mapping[quant_type] + else: + raise ValueError(f"Invalid cache type for llama.cpp: {quant_type}. Valid options are: fp16, q8_0, q4_0.") + def ban_eos_logits_processor(eos_token, input_ids, logits): logits[eos_token] = -float('inf') @@ -75,12 +104,9 @@ def from_pretrained(self, path): 'flash_attn': shared.args.flash_attn } - if shared.args.cache_4bit: - params["type_k"] = 2 - params["type_v"] = 2 - elif shared.args.cache_8bit: - params["type_k"] = 8 - params["type_v"] = 8 + if shared.args.cache_type != 'fp16': + params["type_k"] = get_llamacpp_cache_type_for_string(shared.args.cache_type) + params["type_v"] = get_llamacpp_cache_type_for_string(shared.args.cache_type) result.model = Llama(**params) if cache_capacity > 0: diff --git a/modules/loaders.py b/modules/loaders.py index deee00a7f9..4cb7e349d6 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -31,8 +31,7 @@ 'llama.cpp': [ 'n_ctx', 'n_gpu_layers', - 'cache_8bit', - 'cache_4bit', + 'cache_type', 'tensor_split', 'n_batch', 'threads', @@ -54,8 +53,7 @@ 'llamacpp_HF': [ 'n_ctx', 'n_gpu_layers', - 'cache_8bit', - 'cache_4bit', + 'cache_type', 'tensor_split', 'n_batch', 'threads', @@ -87,8 +85,7 @@ 'no_xformers', 'no_sdpa', 'num_experts_per_token', - 'cache_8bit', - 'cache_4bit', + 'cache_type', 'autosplit', 'enable_tp', 'alpha_value', @@ -103,8 +100,7 @@ 'no_xformers', 'no_sdpa', 'num_experts_per_token', - 'cache_8bit', - 'cache_4bit', + 'cache_type', 'autosplit', 'enable_tp', 'alpha_value', diff --git a/modules/shared.py b/modules/shared.py index 894ed6fe56..cab612268a 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -81,7 +81,6 @@ group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. 
See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.') group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.') group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') -group.add_argument('--chat-buttons', action='store_true', help='Show buttons on the chat tab instead of a hover menu.') group.add_argument('--idle-timeout', type=int, default=0, help='Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.') # Model loader @@ -143,8 +142,6 @@ group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.') group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.') group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.') -group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.') -group.add_argument('--cache_4bit', action='store_true', help='Use Q4 cache to save VRAM.') group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') @@ -167,6 +164,10 @@ group = parser.add_argument_group('TensorRT-LLM') group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.') +# Cache +group = parser.add_argument_group('Cache') +group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') + # DeepSpeed group = parser.add_argument_group('DeepSpeed') group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') @@ -191,6 +192,7 @@ group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certificate key file.', default=None) group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None) group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy') +group.add_argument('--old-colors', action='store_true', help='Use the legacy Gradio colors, before the December/2024 update.') # API group = parser.add_argument_group('API') @@ -213,6 +215,9 @@ group.add_argument('--checkpoint', type=str, help='DEPRECATED') group.add_argument('--monkey-patch', action='store_true', help='DEPRECATED') group.add_argument('--no_inject_fused_attention', action='store_true', help='DEPRECATED') +group.add_argument('--cache_4bit', action='store_true', help='DEPRECATED') +group.add_argument('--cache_8bit', action='store_true', help='DEPRECATED') +group.add_argument('--chat-buttons', action='store_true', help='DEPRECATED') args = parser.parse_args() args_defaults = parser.parse_args([]) @@ -269,6 +274,58 @@ def fix_loader_name(name): return 'TensorRT-LLM' +def transform_legacy_kv_cache_options(opts): + # Handle both argparse.Namespace and dict here + def get(key): + return opts.get(key) if isinstance(opts, dict) else getattr(opts, key, None) + + def set(key, value): + if 
isinstance(opts, dict): + opts[key] = value + else: + setattr(opts, key, value) + + def del_key(key, fallback_set): + # only remove from user dict, can't delete from argparse.Namespace + if type(opts) is dict: + if key in opts: + del opts[key] + else: + setattr(opts, key, fallback_set) + + # Retrieve values + loader = get('loader') + cache_8bit = get('cache_8bit') + cache_4bit = get('cache_4bit') + + # Determine cache type based on loader or legacy flags + if cache_8bit or cache_4bit: + if not loader: + # Legacy behavior: prefer 8-bit over 4-bit to minimize breakage + if cache_8bit: + set('cache_type', 'fp8') + elif cache_4bit: + set('cache_type', 'q4') + elif loader.lower() in ['exllamav2', 'exllamav2_hf']: + # ExLlamaV2 loader-specific cache type + if cache_8bit: + set('cache_type', 'fp8') + elif cache_4bit: + set('cache_type', 'q4') + elif loader.lower() in ['llama.cpp', 'llamacpp_hf']: + # Llama.cpp loader-specific cache type + if cache_4bit: + set('cache_type', 'q4_0') + elif cache_8bit: + set('cache_type', 'q8_0') + + # Clean up legacy keys + del_key('cache_4bit', False) + del_key('cache_8bit', False) + + return opts + + def add_extension(name, last=False): if args.extensions is None: args.extensions = [name] @@ -297,10 +354,14 @@ def load_user_config(): else: user_config = {} + for model_name in user_config: + user_config[model_name] = transform_legacy_kv_cache_options(user_config[model_name]) + return user_config args.loader = fix_loader_name(args.loader) +args = transform_legacy_kv_cache_options(args) # Activate the multimodal extension if args.multimodal_pipeline is not None: diff --git a/modules/ui.py b/modules/ui.py index c07beeb466..4bfea9fade 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -50,6 +50,50 @@ button_secondary_border_color="var(--border-color-primary)" ) +if not shared.args.old_colors: + theme = theme.set( + # General Colors + border_color_primary='#c5c5d2', + body_text_color_subdued='#484848', + background_fill_secondary='#eaeaea', + background_fill_secondary_dark='var(--selected-item-color-dark)', + background_fill_primary='var(--neutral-50)', + background_fill_primary_dark='var(--darker-gray)', + body_background_fill="white", + block_background_fill="transparent", + body_text_color="#333", + button_secondary_background_fill="#f4f4f4", + button_secondary_border_color="var(--border-color-primary)", + + # Dark Mode Colors + input_background_fill_dark='var(--darker-gray)', + checkbox_background_color_dark='var(--darker-gray)', + block_background_fill_dark='transparent', + block_border_color_dark='transparent', + input_border_color_dark='var(--border-color-dark)', + checkbox_border_color_dark='var(--border-color-dark)', + border_color_primary_dark='var(--border-color-dark)', + button_secondary_border_color_dark='var(--border-color-dark)', + body_background_fill_dark='var(--dark-gray)', + button_primary_background_fill_dark='transparent', + button_secondary_background_fill_dark='transparent', + checkbox_label_background_fill_dark='transparent', + button_cancel_background_fill_dark='transparent', + button_secondary_background_fill_hover_dark='var(--selected-item-color-dark)', + checkbox_label_background_fill_hover_dark='var(--selected-item-color-dark)', + table_even_background_fill_dark='var(--darker-gray)', + table_odd_background_fill_dark='var(--selected-item-color-dark)', + code_background_fill_dark='var(--darker-gray)', + + # Shadows and Radius + checkbox_label_shadow='none', + block_shadow='none', + block_shadow_dark='none', + button_large_radius='0.375rem', + 
button_large_padding='6px 12px', + input_radius='0.375rem', + ) + if Path("notification.mp3").exists(): audio_notification_js = "document.querySelector('#audio_notification audio')?.play();" else: @@ -87,8 +131,7 @@ def list_model_elements(): 'no_xformers', 'no_sdpa', 'num_experts_per_token', - 'cache_8bit', - 'cache_4bit', + 'cache_type', 'autosplit', 'enable_tp', 'threads', @@ -232,10 +275,10 @@ def gather_interface_values(*args): def apply_interface_values(state, use_persistent=False): if use_persistent: state = shared.persistent_interface_state - if 'textbox-default' in state: + if 'textbox-default' in state and 'prompt_menu-default' in state: state.pop('prompt_menu-default') - if 'textbox-notebook' in state: + if 'textbox-notebook' and 'prompt_menu-notebook' in state: state.pop('prompt_menu-notebook') elements = list_interface_input_elements() diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 57143cd8c0..e372f5c223 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -12,7 +12,6 @@ inputs = ('Chat input', 'interface_state') reload_arr = ('history', 'name1', 'name2', 'mode', 'chat_style', 'character_menu') -clear_arr = ('delete_chat-confirm', 'delete_chat', 'delete_chat-cancel') def create_ui(): @@ -21,7 +20,27 @@ def create_ui(): shared.gradio['Chat input'] = gr.State() shared.gradio['history'] = gr.JSON({'internal': [], 'visible': []}, visible=False) - with gr.Tab('Chat', elem_id='chat-tab', elem_classes=("old-ui" if shared.args.chat_buttons else None)): + with gr.Tab('Chat', elem_id='chat-tab'): + with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): + with gr.Column(): + with gr.Row(elem_id='past-chats-buttons'): + shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu) + shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) + shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input']) + + with gr.Row(elem_id='delete-chat-row', visible=False) as shared.gradio['delete-chat-row']: + shared.gradio['delete_chat-cancel'] = gr.Button('Cancel', elem_classes=['refresh-button', 'focus-on-chat-input']) + shared.gradio['delete_chat-confirm'] = gr.Button('Confirm', variant='stop', elem_classes=['refresh-button', 'focus-on-chat-input']) + + with gr.Row(elem_id='rename-row', visible=False) as shared.gradio['rename-row']: + shared.gradio['rename_to'] = gr.Textbox(label='Rename to:', placeholder='New name', elem_classes=['no-background']) + with gr.Row(): + shared.gradio['rename_to-cancel'] = gr.Button('Cancel', elem_classes=['refresh-button', 'focus-on-chat-input']) + shared.gradio['rename_to-confirm'] = gr.Button('Confirm', elem_classes=['refresh-button', 'focus-on-chat-input'], variant='primary') + + with gr.Row(): + shared.gradio['unique_id'] = gr.Radio(label="", elem_classes=['slim-dropdown', 'pretty_scrollbar'], interactive=not mu, elem_id='past-chats') + with gr.Row(): with gr.Column(elem_id='chat-col'): shared.gradio['display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')) @@ -60,25 +79,6 @@ def create_ui(): shared.gradio['send-chat-to-default'] = gr.Button('Send to default') shared.gradio['send-chat-to-notebook'] = gr.Button('Send to notebook') - with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): - with gr.Column(): - with gr.Row(): - shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu) - 
shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) - shared.gradio['delete_chat-confirm'] = gr.Button('Confirm', variant='stop', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) - shared.gradio['delete_chat-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) - shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input']) - - with gr.Row(elem_id='rename-row'): - shared.gradio['rename_to'] = gr.Textbox(label='Rename to:', placeholder='New name', visible=False, elem_classes=['no-background']) - with gr.Row(): - shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) - shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) - - gr.Markdown("Past chats") - with gr.Row(): - shared.gradio['unique_id'] = gr.Radio(label="", elem_classes=['slim-dropdown', 'pretty_scrollbar'], interactive=not mu, elem_id='past-chats') - with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']): with gr.Column(): with gr.Row(): @@ -180,29 +180,39 @@ def create_event_handlers(): shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( + None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['textbox'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( + None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Regenerate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( partial(chat.generate_chat_reply_wrapper, regenerate=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( + None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Continue'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( partial(chat.generate_chat_reply_wrapper, _continue=True), 
gradio(inputs), gradio('display', 'history'), show_progress=False).then( + None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Impersonate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then( + lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.impersonate_wrapper, gradio(inputs), gradio('textbox', 'display'), show_progress=False).then( + None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Replace last reply'].click( @@ -234,21 +244,21 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.handle_start_new_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) - shared.gradio['delete_chat'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, gradio(clear_arr)) - shared.gradio['delete_chat-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr)) + shared.gradio['delete_chat'].click(lambda: gr.update(visible=True), None, gradio('delete-chat-row')) + shared.gradio['delete_chat-cancel'].click(lambda: gr.update(visible=False), None, gradio('delete-chat-row')) shared.gradio['delete_chat-confirm'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_delete_chat_confirm_click, gradio('interface_state'), gradio('history', 'display', 'unique_id') + gradio(clear_arr), show_progress=False) + chat.handle_delete_chat_confirm_click, gradio('interface_state'), gradio('history', 'display', 'unique_id', 'delete-chat-row'), show_progress=False) - shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) - shared.gradio['rename_to-cancel'].click(lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) + shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename-row'), show_progress=False) + shared.gradio['rename_to-cancel'].click(lambda: gr.update(visible=False), None, gradio('rename-row'), show_progress=False) shared.gradio['rename_to-confirm'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_rename_chat_confirm, gradio('rename_to', 'interface_state'), gradio('unique_id', 'rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) + chat.handle_rename_chat_confirm, gradio('rename_to', 'interface_state'), gradio('unique_id', 'rename-row')) shared.gradio['rename_to'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_rename_chat_confirm, gradio('rename_to', 'interface_state'), gradio('unique_id', 'rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) + chat.handle_rename_chat_confirm, gradio('rename_to', 'interface_state'), 
gradio('unique_id', 'rename-row'), show_progress=False) shared.gradio['load_chat_history'].upload( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( diff --git a/modules/ui_default.py b/modules/ui_default.py index 112acd2358..ccae9a5ec3 100644 --- a/modules/ui_default.py +++ b/modules/ui_default.py @@ -20,12 +20,12 @@ def create_ui(): with gr.Column(): with gr.Row(): shared.gradio['textbox-default'] = gr.Textbox(value='', lines=27, label='Input', elem_classes=['textbox_default', 'add_scrollbar']) - shared.gradio['token-counter-default'] = gr.HTML(value="0", elem_classes=["token-counter", "default-token-counter"]) + shared.gradio['token-counter-default'] = gr.HTML(value="0", elem_id="default-token-counter") with gr.Row(): - shared.gradio['Generate-default'] = gr.Button('Generate', variant='primary') - shared.gradio['Stop-default'] = gr.Button('Stop', elem_id='stop') shared.gradio['Continue-default'] = gr.Button('Continue') + shared.gradio['Stop-default'] = gr.Button('Stop', elem_id='stop', visible=False) + shared.gradio['Generate-default'] = gr.Button('Generate', variant='primary') with gr.Row(): shared.gradio['prompt_menu-default'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown') @@ -63,20 +63,26 @@ def create_ui(): def create_event_handlers(): shared.gradio['Generate-default'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('Stop-default', 'Generate-default')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( lambda state, left, right: state.update({'textbox-default': left, 'output_textbox': right}), gradio('interface_state', 'textbox-default', 'output_textbox'), None).then( + lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('Stop-default', 'Generate-default')).then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['textbox-default'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('Stop-default', 'Generate-default')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( lambda state, left, right: state.update({'textbox-default': left, 'output_textbox': right}), gradio('interface_state', 'textbox-default', 'output_textbox'), None).then( + lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('Stop-default', 'Generate-default')).then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Continue-default'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('Stop-default', 'Generate-default')).then( generate_reply_wrapper, [shared.gradio['output_textbox']] + gradio(inputs)[1:], gradio(outputs), show_progress=False).then( lambda state, left, right: state.update({'textbox-default': left, 'output_textbox': right}), gradio('interface_state', 'textbox-default', 'output_textbox'), None).then( + lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('Stop-default', 'Generate-default')).then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Stop-default'].click(stop_everything_event, None, None, queue=False) 
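Editor's note: the ui_default.py hunk above swaps the visibility of the Stop and Generate buttons around each generation by chaining gr.update calls before and after generate_reply_wrapper. The following is a minimal, self-contained sketch of that pattern only; it is not part of this patch, and fake_generate plus the component names are placeholders.

# Illustrative sketch (not part of this patch): the show/hide pattern used by the
# Generate/Stop buttons in ui_default.py, reduced to a standalone Gradio demo.
import time

import gradio as gr


def fake_generate(prompt):
    time.sleep(2)  # stand-in for generate_reply_wrapper
    return f"Echo: {prompt}"


with gr.Blocks() as demo:
    prompt = gr.Textbox(label='Input')
    output = gr.Textbox(label='Output')
    generate_btn = gr.Button('Generate', variant='primary')
    stop_btn = gr.Button('Stop', visible=False)

    # Hide Generate and show Stop while the job runs, then restore both afterwards
    generate_btn.click(
        lambda: [gr.update(visible=True), gr.update(visible=False)], None, [stop_btn, generate_btn]).then(
        fake_generate, prompt, output, show_progress=False).then(
        lambda: [gr.update(visible=False), gr.update(visible=True)], None, [stop_btn, generate_btn])

demo.launch()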
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index f87b680aeb..189bedfdf1 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -118,8 +118,7 @@ def create_ui(): shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.') - shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.') - shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use Q4 cache to save VRAM.') + shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, precision=0, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.') shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') @@ -195,13 +194,13 @@ def create_event_handlers(): shared.gradio['model_menu'].change( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then( - load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=False).success( + load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=True).success( handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) shared.gradio['load_model'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( update_model_parameters, gradio('interface_state'), None).then( - partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success( + partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success( handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False) @@ -260,6 +259,8 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur yield ("Please enter a model path") return + repo_id = repo_id.strip() + specific_file = specific_file.strip() downloader = 
importlib.import_module("download-model").ModelDownloader() progress(0.0) @@ -297,7 +298,7 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur downloader.check_model_files(model, branch, links, sha256, output_folder) progress(1.0) else: - yield (f"Downloading file{'s' if len(links) > 1 else ''} to `{output_folder}`") + yield (f"Downloading file{'s' if len(links) > 1 else ''} to `{output_folder}/`") downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=4, is_llamacpp=is_llamacpp) yield (f"Model successfully saved to `{output_folder}/`.") @@ -317,7 +318,7 @@ def create_llamacpp_hf(gguf_name, unquantized_url, progress=gr.Progress()): links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=True) output_folder = Path(shared.args.model_dir) / (re.sub(r'(?i)\.gguf$', '', gguf_name) + "-HF") - yield (f"Downloading tokenizer to `{output_folder}`") + yield (f"Downloading tokenizer to `{output_folder}/`") downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=4, is_llamacpp=False) # Move the GGUF diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py index 799328447c..b234ac5753 100644 --- a/modules/ui_notebook.py +++ b/modules/ui_notebook.py @@ -23,7 +23,7 @@ def create_ui(): with gr.Tab('Raw'): with gr.Row(): shared.gradio['textbox-notebook'] = gr.Textbox(value='', lines=27, elem_id='textbox-notebook', elem_classes=['textbox', 'add_scrollbar']) - shared.gradio['token-counter-notebook'] = gr.HTML(value="0", elem_classes=["token-counter"]) + shared.gradio['token-counter-notebook'] = gr.HTML(value="0", elem_id="notebook-token-counter") with gr.Tab('Markdown'): shared.gradio['markdown_render-notebook'] = gr.Button('Render') @@ -48,10 +48,10 @@ def create_ui(): shared.gradio['tokens-notebook'] = gr.Textbox(lines=23, label='Tokens', elem_classes=['textbox_logits_notebook', 'add_scrollbar', 'monospace']) with gr.Row(): - shared.gradio['Generate-notebook'] = gr.Button('Generate', variant='primary', elem_classes='small-button') - shared.gradio['Stop-notebook'] = gr.Button('Stop', elem_classes='small-button', elem_id='stop') shared.gradio['Undo'] = gr.Button('Undo', elem_classes='small-button') shared.gradio['Regenerate-notebook'] = gr.Button('Regenerate', elem_classes='small-button') + shared.gradio['Stop-notebook'] = gr.Button('Stop', visible=False, elem_classes='small-button', elem_id='stop') + shared.gradio['Generate-notebook'] = gr.Button('Generate', variant='primary', elem_classes='small-button') with gr.Column(scale=1): gr.HTML('
') @@ -66,22 +66,28 @@ def create_event_handlers(): shared.gradio['Generate-notebook'].click( lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('Stop-notebook', 'Generate-notebook')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( lambda state, text: state.update({'textbox-notebook': text}), gradio('interface_state', 'textbox-notebook'), None).then( + lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('Stop-notebook', 'Generate-notebook')).then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['textbox-notebook'].submit( lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('Stop-notebook', 'Generate-notebook')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( lambda state, text: state.update({'textbox-notebook': text}), gradio('interface_state', 'textbox-notebook'), None).then( + lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('Stop-notebook', 'Generate-notebook')).then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Regenerate-notebook'].click( lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + lambda: [gr.update(visible=True), gr.update(visible=False)], None, gradio('Stop-notebook', 'Generate-notebook')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( lambda state, text: state.update({'textbox-notebook': text}), gradio('interface_state', 'textbox-notebook'), None).then( + lambda: [gr.update(visible=False), gr.update(visible=True)], None, gradio('Stop-notebook', 'Generate-notebook')).then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Undo'].click( diff --git a/requirements.txt b/requirements.txt index 2549c64864..24c92391ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ -accelerate==1.0.* -bitsandbytes==0.44.* +accelerate==1.2.* +bitsandbytes==0.45.* colorama datasets einops fastapi==0.112.4 -gradio==4.26.* +gradio==4.37.* jinja2==3.1.4 markdown numba==0.59.* @@ -21,7 +21,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.46.* +transformers==4.47.* tqdm wandb @@ -32,30 +32,30 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu12torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_amd.txt b/requirements_amd.txt index 78bdd3ca84..b7093d5099 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -1,9 +1,9 @@ -accelerate==1.0.* +accelerate==1.2.* colorama datasets einops fastapi==0.112.4 -gradio==4.26.* +gradio==4.37.* jinja2==3.1.4 markdown numba==0.59.* @@ -20,7 +20,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.46.* +transformers==4.47.* tqdm wandb @@ -31,14 +31,14 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and 
platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.1+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.1+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.5+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.5+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 9420e861f8..88682aea04 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -1,9 +1,9 @@ -accelerate==1.0.* +accelerate==1.2.* colorama datasets einops fastapi==0.112.4 -gradio==4.26.* +gradio==4.37.* jinja2==3.1.4 markdown numba==0.59.* @@ -20,7 +20,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.46.* +transformers==4.47.* tqdm wandb @@ -31,12 +31,12 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index 625021ee1f..6588278d03 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -1,9 +1,9 @@ -accelerate==1.0.* +accelerate==1.2.* colorama datasets einops fastapi==0.112.4 -gradio==4.26.* +gradio==4.37.* jinja2==3.1.4 markdown numba==0.59.* @@ -20,7 +20,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.46.* +transformers==4.47.* tqdm wandb @@ -31,8 +31,6 @@ sse-starlette==1.6.5 
tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6-py3-none-any.whl diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 3cb66cbc81..1fc9795bb3 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -1,9 +1,9 @@ -accelerate==1.0.* +accelerate==1.2.* colorama datasets einops fastapi==0.112.4 -gradio==4.26.* +gradio==4.37.* jinja2==3.1.4 markdown numba==0.59.* @@ -20,7 +20,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.46.* +transformers==4.47.* tqdm wandb @@ -31,10 +31,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.1-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.5-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6-py3-none-any.whl diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index fbd6447bdf..53fedd7ec5 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -1,9 +1,9 @@ -accelerate==1.0.* +accelerate==1.2.* colorama datasets einops fastapi==0.112.4 -gradio==4.26.* +gradio==4.37.* jinja2==3.1.4 markdown numba==0.59.* @@ -20,7 +20,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.46.* +transformers==4.47.* tqdm wandb @@ -31,7 +31,7 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index e9ab0fbad3..9f52b17283 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -1,9 +1,9 @@
-accelerate==1.0.*
+accelerate==1.2.*
 colorama
 datasets
 einops
 fastapi==0.112.4
-gradio==4.26.*
+gradio==4.37.*
 jinja2==3.1.4
 markdown
 numba==0.59.*
@@ -20,7 +20,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.46.*
+transformers==4.47.*
 tqdm
 wandb

@@ -31,7 +31,7 @@ sse-starlette==1.6.5
 tiktoken

 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index 99791ea9b4..9ad138d8d4 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -1,10 +1,10 @@
-accelerate==1.0.*
-bitsandbytes==0.44.*
+accelerate==1.2.*
+bitsandbytes==0.45.*
 colorama
 datasets
 einops
 fastapi==0.112.4
-gradio==4.26.*
+gradio==4.37.*
 jinja2==3.1.4
 markdown
 numba==0.59.*
@@ -21,7 +21,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.46.*
+transformers==4.47.*
 tqdm
 wandb

@@ -32,30 +32,30 @@ sse-starlette==1.6.5
 tiktoken

 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.1+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.5+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"

 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.1+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.5+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"

 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.1+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.5+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"

 # CUDA wheels
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu12torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.2.post1/flash_attn-2.7.2.post1+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index f5c3966eb3..e2daebd9ec 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -1,9 +1,9 @@
-accelerate==1.0.*
+accelerate==1.2.*
 colorama
 datasets
 einops
 fastapi==0.112.4
-gradio==4.26.*
+gradio==4.37.*
 jinja2==3.1.4
 markdown
 numba==0.59.*
@@ -20,7 +20,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.46.*
+transformers==4.47.*
 tqdm
 wandb
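Each wheel line in the requirements diffs above ends with a PEP 508 environment marker after the `;`, which is how pip selects the single wheel matching the current OS, architecture, and Python version. The snippet below is a minimal sketch (not part of the diff; it assumes the `packaging` library is installed, e.g. `pip install packaging`) showing how such a marker is evaluated:

```python
# Minimal sketch: evaluate the PEP 508 environment marker on one of the wheel
# lines above to see whether it applies to the current interpreter.
# Assumes the `packaging` library is installed (pip install packaging).
from packaging.markers import Marker

requirement_line = (
    "https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/"
    "llama_cpp_python-0.3.5+cpuavx2-cp311-cp311-win_amd64.whl; "
    'platform_system == "Windows" and python_version == "3.11"'
)

# Split the wheel URL from its marker, then evaluate the marker against the
# running interpreter's environment (OS, machine, Python version).
url, _, marker_text = requirement_line.partition(";")
applies = Marker(marker_text.strip()).evaluate()

print(url.strip())
print(applies)  # True only on Windows with CPython 3.11
```

Because every line in a given section carries a mutually exclusive marker, exactly one prebuilt wheel is installed per platform/Python combination, which is why each version bump in this diff touches four (or more) URL lines at once.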