Skip to content

Commit

Permalink
Merge pull request #6300 from oobabooga/dev
Browse files Browse the repository at this point in the history
Merge dev branch
  • Loading branch information
oobabooga authored Aug 1, 2024
2 parents 498fec2 + 608545d commit d011040
Show file tree
Hide file tree
Showing 23 changed files with 123 additions and 93 deletions.
3 changes: 2 additions & 1 deletion css/main.css
Original file line number Diff line number Diff line change
Expand Up @@ -404,13 +404,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
.message-body h3,
.message-body h4 {
color: var(--body-text-color);
margin: 20px 0 10px 0;
}

.dark .message q {
color: #f5b031;
}

.message q::before, .message q::after {
.message-body q::before, .message-body q::after {
content: "";
}

Expand Down
10 changes: 7 additions & 3 deletions download-model.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,11 +212,15 @@ def get_single_file(self, url, output_folder, start_from_scratch=False):
total_size = int(r.headers.get('content-length', 0))
block_size = 1024 * 1024 # 1MB

filename_str = str(filename) # Convert PosixPath to string if necessary

tqdm_kwargs = {
'total': total_size,
'unit': 'iB',
'unit': 'B',
'unit_scale': True,
'bar_format': '{l_bar}{bar}| {n_fmt}/{total_fmt} {rate_fmt}'
'unit_divisor': 1024,
'bar_format': '{desc}{percentage:3.0f}%|{bar:50}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]',
'desc': f"{filename_str}: "
}

if 'COLAB_GPU' in os.environ:
Expand All @@ -233,7 +237,7 @@ def get_single_file(self, url, output_folder, start_from_scratch=False):
t.update(len(data))
if total_size != 0 and self.progress_bar is not None:
count += len(data)
self.progress_bar(float(count) / float(total_size), f"{filename}")
self.progress_bar(float(count) / float(total_size), f"{filename_str}")

break # Exit loop if successful
except (RequestException, ConnectionError, Timeout) as e:
Expand Down
7 changes: 3 additions & 4 deletions extensions/openai/completions.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,6 @@ def chat_streaming_chunk(content):
yield {'prompt': prompt}
return

token_count = len(encode(prompt)[0])
debug_msg({'prompt': prompt, 'generate_params': generate_params})

if stream:
Expand All @@ -330,7 +329,6 @@ def chat_streaming_chunk(content):

answer = ''
seen_content = ''
completion_token_count = 0

for a in generator:
answer = a['internal'][-1][1]
Expand All @@ -345,6 +343,7 @@ def chat_streaming_chunk(content):
chunk = chat_streaming_chunk(new_content)
yield chunk

token_count = len(encode(prompt)[0])
completion_token_count = len(encode(answer)[0])
stop_reason = "stop"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']:
Expand Down Expand Up @@ -429,8 +428,6 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
prompt = decode(prompt)[0]

prefix = prompt if echo else ''
token_count = len(encode(prompt)[0])
total_prompt_token_count += token_count

# generate reply #######################################
debug_msg({'prompt': prompt, 'generate_params': generate_params})
Expand All @@ -440,6 +437,8 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
for a in generator:
answer = a

token_count = len(encode(prompt)[0])
total_prompt_token_count += token_count
completion_token_count = len(encode(answer)[0])
total_completion_token_count += completion_token_count
stop_reason = "stop"
Expand Down
8 changes: 7 additions & 1 deletion js/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ function doSyntaxHighlighting() {
renderMathInElement(element, {
delimiters: [
{ left: "$$", right: "$$", display: true },
{ left: "$", right: "$", display: false },
{ left: "\\(", right: "\\)", display: false },
{ left: "\\[", right: "\\]", display: true },
],
Expand Down Expand Up @@ -459,7 +460,12 @@ function updateCssProperties() {

// Adjust scrollTop based on input height change
if (chatInputHeight !== currentChatInputHeight) {
chatContainer.scrollTop += chatInputHeight - currentChatInputHeight;
if (!isScrolled && chatInputHeight < currentChatInputHeight) {
chatContainer.scrollTop = chatContainer.scrollHeight;
} else {
chatContainer.scrollTop += chatInputHeight - currentChatInputHeight;
}

currentChatInputHeight = chatInputHeight;
}
}
Expand Down
20 changes: 10 additions & 10 deletions modules/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@
from modules.text_generation import (
generate_reply,
get_encoded_length,
get_max_prompt_length,
stop_everything_event
get_max_prompt_length
)
from modules.utils import delete_file, get_available_characters, save_file

Expand Down Expand Up @@ -93,8 +92,16 @@ def generate_chat_prompt(user_input, state, **kwargs):
chat_template_str = replace_character_names(chat_template_str, state['name1'], state['name2'])

instruction_template = jinja_env.from_string(state['instruction_template_str'])
instruct_renderer = partial(instruction_template.render, add_generation_prompt=False)
chat_template = jinja_env.from_string(chat_template_str)

instruct_renderer = partial(
instruction_template.render,
builtin_tools=None,
tools=None,
tools_in_user_message=False,
add_generation_prompt=False
)

chat_renderer = partial(
chat_template.render,
add_generation_prompt=False,
Expand Down Expand Up @@ -1036,13 +1043,6 @@ def handle_remove_last_click(state):
return [history, html, last_input]


def handle_stop_click(state):
stop_everything_event()
html = redraw_html(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

return html


def handle_unique_id_select(state):
history = load_history(state['unique_id'], state['character_menu'], state['mode'])
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
Expand Down
27 changes: 25 additions & 2 deletions modules/html_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@ def replace_blockquote(m):
@functools.lru_cache(maxsize=None)
def convert_to_markdown(string):

# Make \[ \] LaTeX equations inline
pattern = r'^\s*\\\[\s*\n([\s\S]*?)\n\s*\\\]\s*$'
replacement = r'\\[ \1 \\]'
string = re.sub(pattern, replacement, string, flags=re.MULTILINE)

# Escape backslashes
string = string.replace('\\', '\\\\')

# Quote to <q></q>
string = replace_quotes(string)

Expand All @@ -95,12 +103,27 @@ def convert_to_markdown(string):

result = ''
is_code = False
is_latex = False
for line in string.split('\n'):
if line.lstrip(' ').startswith('```'):
stripped_line = line.strip()

if stripped_line.startswith('```'):
is_code = not is_code
elif stripped_line.startswith('$$'):
is_latex = not is_latex
elif stripped_line.endswith('$$'):
is_latex = False
elif stripped_line.startswith('\\\\['):
is_latex = True
elif stripped_line.startswith('\\\\]'):
is_latex = False
elif stripped_line.endswith('\\\\]'):
is_latex = False

result += line
if is_code or line.startswith('|'): # Don't add an extra \n for tables or code

# Don't add an extra \n for tables, code, or LaTeX
if is_code or is_latex or line.startswith('|'):
result += '\n'
else:
result += '\n\n'
Expand Down
4 changes: 2 additions & 2 deletions modules/logits.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@


def get_next_logits(*args, **kwargs):
if shared.args.idle_timeout > 0 and shared.model is None and shared.previous_model_name not in [None, 'None']:
shared.model, shared.tokenizer = load_model(shared.previous_model_name)
if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
shared.model, shared.tokenizer = load_model(shared.model_name)

needs_lock = not args[2] # use_samplers
if needs_lock:
Expand Down
9 changes: 5 additions & 4 deletions modules/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,14 +368,15 @@ def clear_torch_cache():
torch.cuda.empty_cache()


def unload_model():
def unload_model(keep_model_name=False):
shared.model = shared.tokenizer = None
shared.previous_model_name = shared.model_name
shared.model_name = 'None'
shared.lora_names = []
shared.model_dirty_from_training = False
clear_torch_cache()

if not keep_model_name:
shared.model_name = 'None'


def reload_model():
unload_model()
Expand All @@ -393,7 +394,7 @@ def unload_model_if_idle():
if time.time() - last_generation_time > shared.args.idle_timeout * 60:
if shared.model is not None:
logger.info("Unloading the model for inactivity.")
unload_model()
unload_model(keep_model_name=True)
finally:
shared.generation_lock.release()

Expand Down
3 changes: 0 additions & 3 deletions modules/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
model = None
tokenizer = None
model_name = 'None'
previous_model_name = 'None'
is_seq2seq = False
model_dirty_from_training = False
lora_names = []
Expand Down Expand Up @@ -44,8 +43,6 @@
'negative_prompt': '',
'seed': -1,
'truncation_length': 2048,
'truncation_length_min': 0,
'truncation_length_max': 200000,
'max_tokens_second': 0,
'max_updates_second': 0,
'prompt_lookup_num_tokens': 0,
Expand Down
4 changes: 2 additions & 2 deletions modules/text_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@


def generate_reply(*args, **kwargs):
if shared.args.idle_timeout > 0 and shared.model is None and shared.previous_model_name not in [None, 'None']:
shared.model, shared.tokenizer = load_model(shared.previous_model_name)
if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
shared.model, shared.tokenizer = load_model(shared.model_name)

shared.generation_lock.acquire()
try:
Expand Down
2 changes: 1 addition & 1 deletion modules/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def create_ui():
stride_length = gr.Slider(label='Stride', minimum=0, maximum=32768, value=512, step=256, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.')

with gr.Column():
max_length = gr.Slider(label='max_length', minimum=0, maximum=shared.settings['truncation_length_max'], value=0, step=256, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.')
max_length = gr.Number(label='max_length', precision=0, step=256, value=0, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.')

with gr.Row():
start_current_evaluation = gr.Button("Evaluate loaded model", interactive=not mu)
Expand Down
5 changes: 3 additions & 2 deletions modules/ui_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from modules import chat, shared, ui, utils
from modules.html_generator import chat_html_wrapper
from modules.text_generation import stop_everything_event
from modules.utils import gradio

inputs = ('Chat input', 'interface_state')
Expand Down Expand Up @@ -221,8 +222,8 @@ def create_event_handlers():
chat.handle_remove_last_click, gradio('interface_state'), gradio('history', 'display', 'textbox'), show_progress=False)

shared.gradio['Stop'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_stop_click, gradio('interface_state'), gradio('display'), show_progress=False)
stop_everything_event, None, None, queue=False).then(
chat.redraw_html, gradio(reload_arr), gradio('display'), show_progress=False)

if not shared.args.multi_user:
shared.gradio['unique_id'].select(
Expand Down
6 changes: 3 additions & 3 deletions modules/ui_model_menu.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,19 +93,19 @@ def create_ui():

shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be set to more than 0 for your GPU to be used.')
shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.')
shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch)
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len)
shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. Try lowering this if you run out of memory while loading the model.')
with gr.Blocks():
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=0, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')

shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.')

Expand Down
2 changes: 1 addition & 1 deletion modules/ui_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def create_ui(default_preset):
shared.gradio['sampler_priority'] = gr.Textbox(value=generate_params['sampler_priority'], lines=12, label='Sampler priority', info='Parameter names separated by new lines or commas.')

with gr.Column():
shared.gradio['truncation_length'] = gr.Slider(value=get_truncation_length(), minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.')
shared.gradio['truncation_length'] = gr.Number(precision=0, step=256, value=get_truncation_length(), label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.')
shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.')
shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.')
shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.')
Expand Down
Loading

0 comments on commit d011040

Please sign in to comment.