diff --git a/docs/references/sampling_params.md b/docs/references/sampling_params.md index 77d7c9f82e7..f20144057c0 100644 --- a/docs/references/sampling_params.md +++ b/docs/references/sampling_params.md @@ -1,280 +1,69 @@ # Sampling Parameters in SGLang Runtime + This doc describes the sampling parameters of the SGLang Runtime. It is the low-level endpoint of the runtime. -If you want a high-level endpoint that can automatically handle chat templates, consider using the [OpenAI Compatible API](../backend/openai_api_completions.ipynb). - -The `/generate` endpoint accepts the following arguments in the JSON format. - -```python -@dataclass -class GenerateReqInput: - # The input prompt. It can be a single prompt or a batch of prompts. - text: Optional[Union[List[str], str]] = None - # The token ids for text; one can specify either text or input_ids - input_ids: Optional[Union[List[List[int]], List[int]]] = None - # The embeddings for input_ids; one can specify either text or input_ids or input_embeds. - input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None - # The image input. It can be a file name, a url, or base64 encoded string. - # See also python/sglang/srt/utils.py:load_image. - image_data: Optional[Union[List[str], str]] = None - # The sampling_params. See descriptions below. - sampling_params: Optional[Union[List[Dict], Dict]] = None - # The request id. - rid: Optional[Union[List[str], str]] = None - # Whether to return logprobs. - return_logprob: Optional[Union[List[bool], bool]] = None - # If return logprobs, the start location in the prompt for returning logprobs. - # By default, this value is "-1", which means it will only return logprobs for output tokens. - logprob_start_len: Optional[Union[List[int], int]] = None - # If return logprobs, the number of top logprobs to return at each position. - top_logprobs_num: Optional[Union[List[int], int]] = None - # Whether to detokenize tokens in text in the returned logprobs. - return_text_in_logprobs: bool = False - # Whether to stream output. - stream: bool = False - # Whether to log metrics for this request (e.g. health_generate calls do not log metrics) - log_metrics: bool = True - - # The modalities of the image data [image, multi-images, video] - modalities: Optional[List[str]] = None - # LoRA related - lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None - - # Session info for continual prompting - session_params: Optional[Union[List[Dict], Dict]] = None - # Custom logit processor for advanced sampling control. Must be a serialized instance - # of `CustomLogitProcessor` in python/sglang/srt/sampling/custom_logit_processor.py - # Use the processor's `to_str()` method to generate the serialized string. 
- custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None -``` - -The `sampling_params` follows this format - -```python -# The maximum number of output tokens -max_new_tokens: int = 128, -# Stop when hitting any of the strings in this list -stop: Optional[Union[str, List[str]]] = None, -# Stop when hitting any of the token_ids in this list -stop_token_ids: Optional[List[int]] = [], -# Sampling temperature -temperature: float = 1.0, -# Top-p sampling -top_p: float = 1.0, -# Top-k sampling -top_k: int = -1, -# Min-p sampling -min_p: float = 0.0, -# Whether to ignore EOS token -ignore_eos: bool = False, -# Whether to skip the special tokens during detokenization -skip_special_tokens: bool = True, -# Whether to add spaces between special tokens during detokenization -spaces_between_special_tokens: bool = True, -# Do parallel sampling and return `n` outputs. -n: int = 1, - -## Structured Outputs -# Only one of the below three can be set for a request. - -# Constrain the output to follow a given JSON schema. -json_schema: Optional[str] = None, -# Constrain the output to follow a given regular expression. -regex: Optional[str] = None, -# Constrain the output to follow a given EBNF grammar. -ebnf: Optional[str] = None, - -## Penalties. - -# Float that penalizes new tokens based on their frequency in the generated text so far. -# Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to -# repeat tokens. Must be -2 <= value <= 2. Setting to 0 (default) will disable this penalty. -frequency_penalty: float = 0.0, -# Float that penalizes new tokens based on whether they appear in the generated text so far. -# Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat -# tokens. Must be -2 <= value <= 2. Setting to 0 (default) will disable this penalty. -presence_penalty: float = 0.0, -# Float that penalizes new tokens based on whether they appear in the prompt and the generated text -# so far. Values > 1 encourage the model to use new tokens, while values < 1 encourage the model to -# repeat tokens. Must be 0 <= value <= 2. Setting to 1 (default) will disable this penalty. -repetition_penalty: float = 1.0, -# Guides inference to generate at least this number of tokens by penalizing logits of tokenizer's -# EOS token and `stop_token_ids` to -inf, until the output token reaches given length. -# Note that any of the `stop` string can be generated before reaching `min_new_tokens`, as it is -# difficult to infer the correct token ID by given `stop` strings. -# Must be 0 <= value < max_new_tokens. Setting to 0 (default) will disable this penalty. -min_new_tokens: int = 0, - - -## Custom Parameters for Custom Logit Processor. -# A dictionary of custom parameters for the custom logit processor. -# The custom logit processor takes a list of dictionaries as input, where each -# dictionary is the custom parameters for one token in a batch of the input. 
-# See also python/sglang/srt/sampling/custom_logit_processor.py -custom_params: Optional[Dict[str, Any]] = None, -``` - -## Examples - -### Normal -Launch a server -``` -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 -``` - -Send a request -```python -import requests - -response = requests.post( - "http://localhost:30000/generate", - json={ - "text": "The capital of France is", - "sampling_params": { - "temperature": 0, - "max_new_tokens": 32, - }, - }, -) -print(response.json()) -``` - -### Streaming -Send a request and stream the output -```python -import requests, json - -response = requests.post( - "http://localhost:30000/generate", - json={ - "text": "The capital of France is", - "sampling_params": { - "temperature": 0, - "max_new_tokens": 32, - }, - "stream": True, - }, - stream=True, -) +If you want a high-level endpoint that can automatically handle chat templates, consider using the [OpenAI Compatible API](https://docs.sglang.ai/backend/openai_api_completions.html). -prev = 0 -for chunk in response.iter_lines(decode_unicode=False): - chunk = chunk.decode("utf-8") - if chunk and chunk.startswith("data:"): - if chunk == "data: [DONE]": - break - data = json.loads(chunk[5:].strip("\n")) - output = data["text"].strip() - print(output[prev:], end="", flush=True) - prev = len(output) -print("") -``` +## `/generate` Endpoint -### Multi modal +The `/generate` endpoint accepts the following parameters in JSON format. For detailed usage, see the [native API docs](https://docs.sglang.ai/backend/native_api.html). A minimal example request is sketched after this list. -Launch a server -``` -python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --chat-template chatml-llava -``` +* `text`: The input prompt. Can be a single prompt or a batch of prompts. +* `input_ids`: Alternative to `text`. Specify the input as token IDs instead of text. +* `input_embeds`: Alternative to `text` and `input_ids`. Specify the input as embeddings. +* `image_data`: Image input as a file path, URL, base64 string, raw bytes, or video frame. +* `sampling_params`: The sampling parameters as described in the sections below. +* `rid`: Request identifier. +* `return_logprob`: Whether to return log probabilities for tokens. +* `logprob_start_len`: If returning log probabilities, specifies the start position in the prompt. Default is `-1`, which returns logprobs only for output tokens. +* `top_logprobs_num`: If returning log probabilities, specifies the number of top logprobs to return at each position. +* `return_text_in_logprobs`: Whether to include decoded text alongside token IDs in logprob outputs. +* `stream`: Whether to stream the output. +* `log_metrics`: Whether to log metrics for this request. +* `modalities`: The modalities of the image data. Can be `image`, `multi-images`, or `video`. +* `lora_path`: Path to LoRA weights. +* `session_params`: Session information used to control multi-turn conversation state. +* `custom_logit_processor`: Custom logit processor for advanced sampling control. For usage, see below.
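+For illustration, here is a minimal sketch of a `/generate` request using a few of the fields above. It assumes a server is already running locally, e.g. launched with `python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000`; adjust the model path and port to your setup.
+
+```python
+import requests
+
+# Minimal /generate request: prompt text plus a small set of sampling parameters.
+response = requests.post(
+    "http://localhost:30000/generate",
+    json={
+        "text": "The capital of France is",
+        "sampling_params": {
+            "temperature": 0,        # greedy sampling
+            "max_new_tokens": 32,    # cap the output length
+        },
+    },
+)
+print(response.json())
+```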
-Download an image -``` -curl -o example_image.png -L https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true -``` +## Sampling Parameters -Send a request -```python -import requests +### Core Parameters -response = requests.post( - "http://localhost:30000/generate", - json={ - "text": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - "<|im_start|>user\n\nDescribe this image in a very short sentence.<|im_end|>\n" - "<|im_start|>assistant\n", - "image_data": "example_image.png", - "sampling_params": { - "temperature": 0, - "max_new_tokens": 32, - }, - }, -) -print(response.json()) -``` +* `max_new_tokens`: The maximum output length measured in tokens. +* `stop`: One or multiple [stop words](https://developer.nvidia.com/blog/how-to-get-better-outputs-from-your-large-language-model/#let_the_model_know_when_to_stop). Generation will stop if one of these words is sampled. +* `stop_token_ids`: Provide stop words in the form of token IDs. Generation will stop if one of these token IDs is sampled. +* `temperature`: [Temperature](https://developer.nvidia.com/blog/how-to-get-better-outputs-from-your-large-language-model/#predictability_vs_creativity) when sampling the next token. `temperature = 0` corresponds to greedy sampling; a higher temperature leads to more diversity. +* `top_p`: [Top-p](https://developer.nvidia.com/blog/how-to-get-better-outputs-from-your-large-language-model/#predictability_vs_creativity) selects tokens from the smallest sorted set whose cumulative probability exceeds `top_p`. When `top_p = 1`, this reduces to unrestricted sampling from all tokens. +* `top_k`: [Top-k](https://developer.nvidia.com/blog/how-to-get-better-outputs-from-your-large-language-model/#predictability_vs_creativity) randomly selects from the `k` highest-probability tokens. +* `min_p`: [Min-p](https://github.com/huggingface/transformers/issues/27670) samples from tokens with probability larger than `min_p * highest_token_probability`. -The `image_data` can be a file name, a URL, or a base64 encoded string. See also `python/sglang/srt/utils.py:load_image`. -Streaming is supported in a similar manner as [above](#streaming). +### Penalizers -### Structured Outputs (JSON, Regex, EBNF) -You can specify a JSON schema, regular expression or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request. +To use penalizers, you need to launch the server with `--disable-overlap`. Please note that this might degrade performance. A sketch of a request combining these penalizers with the core parameters above follows the list below. -SGLang supports two grammar backends: +* `frequency_penalty`: Penalizes tokens based on their frequency in the generation so far. Must be between `-2` and `2`, where negative values encourage repetition of tokens and positive values encourage sampling of new tokens. The penalization grows linearly with each appearance of a token. +* `presence_penalty`: Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2`, where negative values encourage repetition of tokens and positive values encourage sampling of new tokens. The penalization is constant once a token has appeared at least once. +* `repetition_penalty`: Penalizes tokens if they appeared in the prompt or generation so far. Must be between `0` and `2`, where values smaller than `1` encourage repetition of tokens and values larger than `1` encourage sampling of new tokens. The penalization scales multiplicatively. +* `min_new_tokens`: Forces the model to generate at least `min_new_tokens` tokens before the EOS token or one of the `stop_token_ids` can be sampled. Note that this might lead to unintended behavior, for example, if the distribution is highly skewed towards these tokens.
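+The following sketch is illustrative only; it combines some of the core parameters and penalizers above in a single request. It assumes a server launched with `--disable-overlap` as noted above, e.g. `python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --disable-overlap`, and the prompt and parameter values are arbitrary.
+
+```python
+import requests
+
+# Combine core sampling parameters with penalizers in one /generate request.
+response = requests.post(
+    "http://localhost:30000/generate",
+    json={
+        "text": "List some fun things to do in Paris.",
+        "sampling_params": {
+            "temperature": 0.7,        # moderate randomness
+            "top_p": 0.9,              # nucleus sampling over the top 90% probability mass
+            "max_new_tokens": 128,
+            "frequency_penalty": 0.5,  # discourage tokens that already appear often
+            "presence_penalty": 0.2,   # mildly discourage any repeated token
+            "min_new_tokens": 16,      # suppress EOS until at least 16 tokens are generated
+        },
+    },
+)
+print(response.json())
+```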
-- [Outlines](https://github.com/dottxt-ai/outlines) (default): Supports JSON schema and regular expression constraints. -- [XGrammar](https://github.com/mlc-ai/xgrammar): Supports JSON schema, regular expression, and EBNF constraints. - - XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md) +### Constrained Decoding -Initialize the XGrammar backend using `--grammar-backend xgrammar` flag -```bash -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \ ---port 30000 --host 0.0.0.0 --grammar-backend [xgrammar|outlines] # xgrammar or outlines (default: outlines) -``` +Please refer to our dedicated guide on [constrained decoding](https://docs.sglang.ai/backend/structured_outputs.html#Native-API-and-SGLang-Runtime-(SRT)) for the following parameters. -```python -import json -import requests +* `json_schema` +* `regex` +* `ebnf` -json_schema = json.dumps({ - "type": "object", - "properties": { - "name": {"type": "string", "pattern": "^[\\w]+$"}, - "population": {"type": "integer"}, - }, - "required": ["name", "population"], -}) +### Other Options -# JSON (works with both Outlines and XGrammar) -response = requests.post( - "http://localhost:30000/generate", - json={ - "text": "Here is the information of the capital of France in the JSON format.\n", - "sampling_params": { - "temperature": 0, - "max_new_tokens": 64, - "json_schema": json_schema, - }, - }, -) -print(response.json()) +* `n`: Generate `n` outputs for the prompt using parallel sampling. +* `spaces_between_special_tokens`: Whether or not to add spaces between special tokens during detokenization. +* `no_stop_trim`: Don't trim stop words or the EOS token from the generated text. +* `ignore_eos`: Don't stop generation when the EOS token is sampled. +* `skip_special_tokens`: Remove special tokens during decoding. +* `custom_params`: Used when employing `CustomLogitProcessor`. For usage, see below. -# Regular expression (Outlines backend only) -response = requests.post( - "http://localhost:30000/generate", - json={ - "text": "Paris is the capital of", - "sampling_params": { - "temperature": 0, - "max_new_tokens": 64, - "regex": "(France|England)", - }, - }, -) -print(response.json()) -# EBNF (XGrammar backend only) -response = requests.post( - "http://localhost:30000/generate", - json={ - "text": "Write a greeting.", - "sampling_params": { - "temperature": 0, - "max_new_tokens": 64, - "ebnf": 'root ::= "Hello" | "Hi" | "Hey"', - }, - }, -) -print(response.json()) -``` ### Custom Logit Processor Launch a server with `--enable-custom-logit-processor` flag on. ``` @@ -282,6 +71,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct ``` Define a custom logit processor that will always sample a specific token id. + ```python from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor @@ -304,6 +94,7 @@ class DeterministicLogitProcessor(CustomLogitProcessor): ``` Send a request + ```python import requests