From de619670d8f40c504600e0bfa1f3c392c442652e Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Thu, 10 Oct 2024 11:21:26 -0700 Subject: [PATCH 01/14] Fix block allocation for prefix caching --- router/src/scheduler.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/router/src/scheduler.rs b/router/src/scheduler.rs index a847936f0..d8baae290 100644 --- a/router/src/scheduler.rs +++ b/router/src/scheduler.rs @@ -370,19 +370,17 @@ impl AdapterSchedulerState { // If we're prefix caching, this check could be under-estimating the number of available blocks // due to shared prefixes, so we'll let the block allocator determine whether we have enough space. - if !self.prefix_caching { - if prefill_tokens > prefill_token_budget - || (prefill_tokens + decode_tokens + self.speculate) > token_budget - { - // Entry is over budget - // Add it back to the front - tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate); - self.queues_state - .lock() - .await - .push_front(&adapter, id, entry); - break; - } + if prefill_tokens > prefill_token_budget + || (prefill_tokens + decode_tokens + self.speculate) > token_budget + { + // Entry is over budget + // Add it back to the front + tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate); + self.queues_state + .lock() + .await + .push_front(&adapter, id, entry); + break; } let tokens = entry.request.input_length() From 734636c3a2f09f9742f11913cb95385da5c404fa Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 13 Oct 2024 18:24:14 -0700 Subject: [PATCH 02/14] TEMP: docker --- .github/workflows/build.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 56808422d..939d952e2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -5,6 +5,7 @@ on: push: branches: - 'main' + - 'return-n' tags: - 'v*' @@ -62,10 +63,7 @@ jobs: images: | ghcr.io/predibase/lorax tags: | - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=sha,prefix=,suffix=,format=short - type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }} + type=raw,value=return-n,enable=${{ github.ref == 'refs/heads/return-n' }} - name: Create a hash from tags env: From 98f9d34f9a5673bf4b0b6fd2b7a361d5fc68e641 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 13 Oct 2024 18:38:39 -0700 Subject: [PATCH 03/14] Fix --- router/src/scheduler.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/router/src/scheduler.rs b/router/src/scheduler.rs index d8baae290..7777938be 100644 --- a/router/src/scheduler.rs +++ b/router/src/scheduler.rs @@ -198,9 +198,6 @@ struct AdapterSchedulerState { /// Speculation amount speculate: u32, - /// Prefix caching - prefix_caching: bool, - /// Paged Attention Block Allocation block_allocator: Option, } @@ -242,7 +239,6 @@ impl AdapterSchedulerState { block_size, window_size, speculate, - prefix_caching, block_allocator, } } From e46e1e66450a635bb17d73b5427fbf51cb0a8370 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 13 Oct 2024 18:53:09 -0700 Subject: [PATCH 04/14] Change tag --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 939d952e2..753050076 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -63,7 +63,7 @@ jobs: images: | ghcr.io/predibase/lorax tags: | - type=raw,value=return-n,enable=${{ github.ref == 'refs/heads/return-n' }} + type=raw,value=returnn,enable=${{ github.ref == 'refs/heads/return-n' }} - name: Create a hash from tags env: From 279e09e381099e9bec71a2a89233c90c47ae5906 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 13 Oct 2024 19:26:20 -0700 Subject: [PATCH 05/14] Revert SOCI --- .github/workflows/build.yaml | 111 +++++------------------------------ 1 file changed, 15 insertions(+), 96 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 753050076..1f8752150 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -8,6 +8,11 @@ on: - 'return-n' tags: - 'v*' + pull_request: + types: [opened, synchronize, reopened] + branches: [ master ] + paths: + - '.github/workflows/build.yaml' jobs: build-and-push-image: @@ -28,7 +33,7 @@ jobs: uses: actions/checkout@v3 with: submodules: recursive - + - name: Free Disk Space (Ubuntu) uses: jlumbroso/free-disk-space@main with: @@ -39,23 +44,6 @@ jobs: large-packages: false swap-storage: true - - name: Install soci - uses: lerentis/soci-installer@v1.0.1 - with: - soci-release: 'v0.4.0' - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2.10.0 - - - name: Set up containerd for ubuntu - uses: crazy-max/ghaction-setup-containerd@v2.2.0 - with: - config-inline: | - version = 2 - - # persistent data location - root = "/runner/build/containerd" - - name: Docker meta id: meta uses: docker/metadata-action@v5 @@ -63,89 +51,20 @@ jobs: images: | ghcr.io/predibase/lorax tags: | - type=raw,value=returnn,enable=${{ github.ref == 'refs/heads/return-n' }} - - - name: Create a hash from tags - env: - tags: ${{ steps.meta.outputs.tags }} - id: vars - run: | - tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}') - echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT - echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT - echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT - echo "image_path=/runner/build/images/lorax" >> $GITHUB_OUTPUT - - - name: Create and update image/cache directory - env: - image_dir: ${{ steps.vars.outputs.image_dir }} - cache_dir: ${{ steps.vars.outputs.cache_dir }} - run: | - sudo mkdir -p $image_dir - sudo chown ubuntu:ubuntu $image_dir - - sudo mkdir -p $cache_dir - sudo chown ubuntu:ubuntu $cache_dir - - - name: Export Docker image as OCI - uses: docker/build-push-action@v5 - with: - context: . - file: ./Dockerfile # Path to your Dockerfile - push: false - tags: ${{ steps.meta.outputs.tags }} - outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz - cache-from: type=local,src=${{ steps.vars.outputs.cache_dir }} - cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars.outputs.cache_dir }} - - - name: Import image in containerd - env: - tag_hash: ${{ steps.vars.outputs.tag_hash }} - image_path: ${{ steps.vars.outputs.image_path }} - run: | - echo "Importing $image_path-$tag_hash to Containerd" - sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz - + type=raw,value=return-n + - name: Log in to GitHub Container Registry uses: docker/login-action@v1 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GHCR_PAT }} - - - name: Push image with containerd - env: - tags: ${{ steps.meta.outputs.tags }} - run: | - for tag in $tags - do - echo "Pushing $tag to GHCR" - sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag - done - - - name: Create and push soci index - env: + + - name: Build and Push Image + uses: docker/build-push-action@v2 + with: + context: . + file: ./Dockerfile # Path to your Dockerfile + push: true tags: ${{ steps.meta.outputs.tags }} - run: | - export SOCI_PATH=$HOME/.soci/soci - for tag in $tags - do - echo "Creating soci index for $tag" - sudo $SOCI_PATH create $tag - echo "Pushing soci index for $tag" - sudo $SOCI_PATH push --user ${{ github.repository_owner }}:${{ secrets.GHCR_PAT }} $tag - done - - - name: Prune older images - env: - tag_hash: ${{ steps.vars.outputs.tag_hash }} - image_path: ${{ steps.vars.outputs.image_path }} - run: | - # Delete images older than a day from docker store - docker image prune -a -f --filter "until=24h" - - # Delete the on disk copy - rm -rf "$image_path-$tag_hash.tar.gz" - # Delete the SHA image(s) from containerd store - sudo ctr i rm $(sudo ctr i ls -q) From 3f6cd52dc21fb3dc9f3c52a9c1c392ed1dea490d Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Sun, 13 Oct 2024 21:41:29 -0700 Subject: [PATCH 06/14] Fix mixtral fp8 --- .../custom_modeling/flash_mixtral_modeling.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/server/lorax_server/models/custom_modeling/flash_mixtral_modeling.py b/server/lorax_server/models/custom_modeling/flash_mixtral_modeling.py index 0455788a0..2a7622c99 100644 --- a/server/lorax_server/models/custom_modeling/flash_mixtral_modeling.py +++ b/server/lorax_server/models/custom_modeling/flash_mixtral_modeling.py @@ -172,7 +172,11 @@ def _load_gqa(config, prefix: str, weights): dim=0, ) - if config.quantize not in ["gptq", "awq"]: + input_scale, weight_scale = None, None + if type(weight) is tuple: + weight, input_scale, weight_scale = weight + + if config.quantize not in ["gptq", "awq", "fp8"]: weight = weight.to(dtype=weights.dtype).to(device=weights.device) head_size = config.hidden_size // config.num_attention_heads @@ -183,7 +187,13 @@ def _load_gqa(config, prefix: str, weights): config.hidden_size, ], f"{list(weight.shape)} != {[(num_heads + 2 * num_key_value_heads) * head_size, config.hidden_size]}" - return TensorParallelColumnLinear(get_linear(weight, bias=None, quantize=config.quantize)) + return TensorParallelColumnLinear(get_linear( + weight, + bias=None, + quantize=config.quantize, + weight_scale=weight_scale, + input_scale=input_scale, + )) def _load_experts(config, prefix, mat, weights): From 9670e449005d6528e380d4310866744191639e81 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 14 Oct 2024 21:30:27 -0700 Subject: [PATCH 07/14] Revert "Revert SOCI" This reverts commit 279e09e381099e9bec71a2a89233c90c47ae5906. --- .github/workflows/build.yaml | 111 ++++++++++++++++++++++++++++++----- 1 file changed, 96 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 1f8752150..753050076 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -8,11 +8,6 @@ on: - 'return-n' tags: - 'v*' - pull_request: - types: [opened, synchronize, reopened] - branches: [ master ] - paths: - - '.github/workflows/build.yaml' jobs: build-and-push-image: @@ -33,7 +28,7 @@ jobs: uses: actions/checkout@v3 with: submodules: recursive - + - name: Free Disk Space (Ubuntu) uses: jlumbroso/free-disk-space@main with: @@ -44,6 +39,23 @@ jobs: large-packages: false swap-storage: true + - name: Install soci + uses: lerentis/soci-installer@v1.0.1 + with: + soci-release: 'v0.4.0' + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2.10.0 + + - name: Set up containerd for ubuntu + uses: crazy-max/ghaction-setup-containerd@v2.2.0 + with: + config-inline: | + version = 2 + + # persistent data location + root = "/runner/build/containerd" + - name: Docker meta id: meta uses: docker/metadata-action@v5 @@ -51,20 +63,89 @@ jobs: images: | ghcr.io/predibase/lorax tags: | - type=raw,value=return-n - + type=raw,value=returnn,enable=${{ github.ref == 'refs/heads/return-n' }} + + - name: Create a hash from tags + env: + tags: ${{ steps.meta.outputs.tags }} + id: vars + run: | + tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}') + echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT + echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT + echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT + echo "image_path=/runner/build/images/lorax" >> $GITHUB_OUTPUT + + - name: Create and update image/cache directory + env: + image_dir: ${{ steps.vars.outputs.image_dir }} + cache_dir: ${{ steps.vars.outputs.cache_dir }} + run: | + sudo mkdir -p $image_dir + sudo chown ubuntu:ubuntu $image_dir + + sudo mkdir -p $cache_dir + sudo chown ubuntu:ubuntu $cache_dir + + - name: Export Docker image as OCI + uses: docker/build-push-action@v5 + with: + context: . + file: ./Dockerfile # Path to your Dockerfile + push: false + tags: ${{ steps.meta.outputs.tags }} + outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz + cache-from: type=local,src=${{ steps.vars.outputs.cache_dir }} + cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars.outputs.cache_dir }} + + - name: Import image in containerd + env: + tag_hash: ${{ steps.vars.outputs.tag_hash }} + image_path: ${{ steps.vars.outputs.image_path }} + run: | + echo "Importing $image_path-$tag_hash to Containerd" + sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz + - name: Log in to GitHub Container Registry uses: docker/login-action@v1 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GHCR_PAT }} - - - name: Build and Push Image - uses: docker/build-push-action@v2 - with: - context: . - file: ./Dockerfile # Path to your Dockerfile - push: true + + - name: Push image with containerd + env: + tags: ${{ steps.meta.outputs.tags }} + run: | + for tag in $tags + do + echo "Pushing $tag to GHCR" + sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag + done + + - name: Create and push soci index + env: tags: ${{ steps.meta.outputs.tags }} + run: | + export SOCI_PATH=$HOME/.soci/soci + for tag in $tags + do + echo "Creating soci index for $tag" + sudo $SOCI_PATH create $tag + echo "Pushing soci index for $tag" + sudo $SOCI_PATH push --user ${{ github.repository_owner }}:${{ secrets.GHCR_PAT }} $tag + done + + - name: Prune older images + env: + tag_hash: ${{ steps.vars.outputs.tag_hash }} + image_path: ${{ steps.vars.outputs.image_path }} + run: | + # Delete images older than a day from docker store + docker image prune -a -f --filter "until=24h" + + # Delete the on disk copy + rm -rf "$image_path-$tag_hash.tar.gz" + # Delete the SHA image(s) from containerd store + sudo ctr i rm $(sudo ctr i ls -q) From 4832ce825e2f34b16d7be4088ac62812f578d05d Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 14 Oct 2024 21:30:53 -0700 Subject: [PATCH 08/14] Change tag --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 753050076..939d952e2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -63,7 +63,7 @@ jobs: images: | ghcr.io/predibase/lorax tags: | - type=raw,value=returnn,enable=${{ github.ref == 'refs/heads/return-n' }} + type=raw,value=return-n,enable=${{ github.ref == 'refs/heads/return-n' }} - name: Create a hash from tags env: From f40fea02215d6492d4266aebb66a4708118da742 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 14 Oct 2024 22:24:36 -0700 Subject: [PATCH 09/14] Fixed return n for chat --- router/src/lib.rs | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/router/src/lib.rs b/router/src/lib.rs index ff51fe7ca..b15612237 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -386,7 +386,7 @@ pub struct SimpleToken { stop: usize, } -#[derive(Serialize, ToSchema)] +#[derive(Serialize, ToSchema, Clone)] #[serde(rename_all(serialize = "snake_case"))] pub(crate) enum FinishReason { #[schema(rename = "length")] @@ -886,21 +886,46 @@ impl From for ChatCompletionResponse { .unwrap_or(0); let total_tokens = prompt_tokens + completion_tokens; + // assign choices as the generated text, and include the best of sequences if available + let mut choices = vec![ChatCompletionResponseChoice { + index: 0, + message: ChatMessage { + role: Some("assistant".to_string()), + content: Some(resp.generated_text), + }, + finish_reason: resp + .details + .as_ref() + .map(|x| CompletionFinishReason::from(x.finish_reason.clone())), + }]; + + choices.extend( + resp.details + .as_ref() + .and_then(|x| x.best_of_sequences.as_ref()) + .into_iter() + .flat_map(|seqs| { + seqs.iter() + .enumerate() + .map(|(index, seq)| ChatCompletionResponseChoice { + index: index as i32 + 1, + message: ChatMessage { + role: Some("assistant".to_string()), + content: Some(seq.generated_text.clone()), + }, + finish_reason: Some(CompletionFinishReason::from( + seq.finish_reason.clone(), + )), + }) + }), + ); + ChatCompletionResponse { id: "null".to_string(), object: "text_completion".to_string(), created: 0, model: "null".to_string(), - choices: vec![ChatCompletionResponseChoice { - index: 0, - message: ChatMessage { - role: Some("assistant".to_string()), - content: Some(resp.generated_text), - }, - finish_reason: resp - .details - .map(|x| CompletionFinishReason::from(x.finish_reason)), - }], + choices: choices, usage: UsageInfo { prompt_tokens: prompt_tokens, total_tokens: total_tokens, From ea8169bb48365b357b7b704b434c0b54dd2ed4a1 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Mon, 14 Oct 2024 22:31:46 -0700 Subject: [PATCH 10/14] Replace model if with empty --- router/src/server.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/router/src/server.rs b/router/src/server.rs index 7a04a6d6f..bb2b7a2d2 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -256,10 +256,17 @@ async fn chat_completions_v1( } }; + let mut adapter_id = Some(req.model.clone()); + if req.model == info.model_id.as_str() { + // Allow user to specify the base model, but treat it as an empty adapter_id + tracing::debug!("Replacing base model {0} with empty adapter_id", req.model); + adapter_id = None; + } + let mut gen_req = CompatGenerateRequest { inputs: inputs.to_string(), parameters: GenerateParameters { - adapter_id: req.model.parse().ok(), + adapter_id: adapter_id, adapter_source: req.adapter_source, adapter_parameters: None, api_token: req.api_token, From 94e0333c37adc2d5d081da8ac192748351b873b9 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Tue, 15 Oct 2024 08:51:21 -0700 Subject: [PATCH 11/14] Use extra_body --- router/src/lib.rs | 20 ++++++++++++-------- router/src/server.rs | 21 +++++++++++++++------ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/router/src/lib.rs b/router/src/lib.rs index b15612237..7af1531eb 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -562,6 +562,17 @@ impl From for TextMessage { } } +#[derive(Clone, Debug, Deserialize, ToSchema)] +struct ExtraBody { + // TODO(travis): add other LoRAX params here + response_format: Option, + repetition_penalty: Option, + top_k: Option, + ignore_eos_token: Option, + adapter_source: Option, + api_token: Option, +} + #[derive(Clone, Debug, Deserialize, ToSchema)] struct ChatCompletionRequest { model: String, @@ -582,14 +593,7 @@ struct ChatCompletionRequest { #[allow(dead_code)] // For now allow this field even though it is unused user: Option, seed: Option, - // Additional parameters - // TODO(travis): add other LoRAX params here - response_format: Option, - repetition_penalty: Option, - top_k: Option, - ignore_eos_token: Option, - adapter_source: Option, - api_token: Option, + extra_body: Option, } #[derive(Clone, Debug, Deserialize, ToSchema)] diff --git a/router/src/server.rs b/router/src/server.rs index bb2b7a2d2..313c3f6f2 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -267,18 +267,24 @@ async fn chat_completions_v1( inputs: inputs.to_string(), parameters: GenerateParameters { adapter_id: adapter_id, - adapter_source: req.adapter_source, + adapter_source: req + .extra_body + .as_ref() + .and_then(|x| x.adapter_source.clone()), adapter_parameters: None, - api_token: req.api_token, + api_token: req.extra_body.as_ref().and_then(|x| x.api_token.clone()), best_of: req.n.map(|x| x as usize), temperature: req.temperature, - repetition_penalty: req.repetition_penalty, - top_k: req.top_k, + repetition_penalty: req.extra_body.as_ref().and_then(|x| x.repetition_penalty), + top_k: req.extra_body.as_ref().and_then(|x| x.top_k), top_p: req.top_p, typical_p: None, do_sample: !req.n.is_none(), max_new_tokens: req.max_tokens.map(|x| x as u32), - ignore_eos_token: req.ignore_eos_token.unwrap_or(false), + ignore_eos_token: req + .extra_body + .as_ref() + .map_or(false, |x| x.ignore_eos_token.unwrap_or(false)), return_full_text: None, stop: req.stop, truncate: None, @@ -288,7 +294,10 @@ async fn chat_completions_v1( return_k_alternatives: None, apply_chat_template: false, seed: req.seed, - response_format: req.response_format, + response_format: req + .extra_body + .as_ref() + .and_then(|x| x.response_format.clone()), }, stream: req.stream.unwrap_or(false), }; From 96f7130aa7901f43f61188759ac03d461943ea3e Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Tue, 15 Oct 2024 08:52:54 -0700 Subject: [PATCH 12/14] Put login first --- .github/workflows/build.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 939d952e2..8026b8832 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -24,6 +24,13 @@ jobs: security-events: write steps: + - name: Log in to GitHub Container Registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GHCR_PAT }} + - name: Checkout repository uses: actions/checkout@v3 with: @@ -106,13 +113,6 @@ jobs: echo "Importing $image_path-$tag_hash to Containerd" sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz - - name: Log in to GitHub Container Registry - uses: docker/login-action@v1 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GHCR_PAT }} - - name: Push image with containerd env: tags: ${{ steps.meta.outputs.tags }} From 850988983053dfd9837ddffb61c4b2e8ea228ec7 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Tue, 15 Oct 2024 09:33:34 -0700 Subject: [PATCH 13/14] Revert "Use extra_body" This reverts commit 94e0333c37adc2d5d081da8ac192748351b873b9. --- router/src/lib.rs | 20 ++++++++------------ router/src/server.rs | 21 ++++++--------------- 2 files changed, 14 insertions(+), 27 deletions(-) diff --git a/router/src/lib.rs b/router/src/lib.rs index 7af1531eb..b15612237 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -562,17 +562,6 @@ impl From for TextMessage { } } -#[derive(Clone, Debug, Deserialize, ToSchema)] -struct ExtraBody { - // TODO(travis): add other LoRAX params here - response_format: Option, - repetition_penalty: Option, - top_k: Option, - ignore_eos_token: Option, - adapter_source: Option, - api_token: Option, -} - #[derive(Clone, Debug, Deserialize, ToSchema)] struct ChatCompletionRequest { model: String, @@ -593,7 +582,14 @@ struct ChatCompletionRequest { #[allow(dead_code)] // For now allow this field even though it is unused user: Option, seed: Option, - extra_body: Option, + // Additional parameters + // TODO(travis): add other LoRAX params here + response_format: Option, + repetition_penalty: Option, + top_k: Option, + ignore_eos_token: Option, + adapter_source: Option, + api_token: Option, } #[derive(Clone, Debug, Deserialize, ToSchema)] diff --git a/router/src/server.rs b/router/src/server.rs index 313c3f6f2..bb2b7a2d2 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -267,24 +267,18 @@ async fn chat_completions_v1( inputs: inputs.to_string(), parameters: GenerateParameters { adapter_id: adapter_id, - adapter_source: req - .extra_body - .as_ref() - .and_then(|x| x.adapter_source.clone()), + adapter_source: req.adapter_source, adapter_parameters: None, - api_token: req.extra_body.as_ref().and_then(|x| x.api_token.clone()), + api_token: req.api_token, best_of: req.n.map(|x| x as usize), temperature: req.temperature, - repetition_penalty: req.extra_body.as_ref().and_then(|x| x.repetition_penalty), - top_k: req.extra_body.as_ref().and_then(|x| x.top_k), + repetition_penalty: req.repetition_penalty, + top_k: req.top_k, top_p: req.top_p, typical_p: None, do_sample: !req.n.is_none(), max_new_tokens: req.max_tokens.map(|x| x as u32), - ignore_eos_token: req - .extra_body - .as_ref() - .map_or(false, |x| x.ignore_eos_token.unwrap_or(false)), + ignore_eos_token: req.ignore_eos_token.unwrap_or(false), return_full_text: None, stop: req.stop, truncate: None, @@ -294,10 +288,7 @@ async fn chat_completions_v1( return_k_alternatives: None, apply_chat_template: false, seed: req.seed, - response_format: req - .extra_body - .as_ref() - .and_then(|x| x.response_format.clone()), + response_format: req.response_format, }, stream: req.stream.unwrap_or(false), }; From 6a2b7fc4afe455ea9303b7058cc31037d0e0cd4b Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Tue, 15 Oct 2024 09:44:56 -0700 Subject: [PATCH 14/14] Revert docker --- .github/workflows/build.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 8026b8832..10b27fbc7 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -5,7 +5,6 @@ on: push: branches: - 'main' - - 'return-n' tags: - 'v*' @@ -70,7 +69,10 @@ jobs: images: | ghcr.io/predibase/lorax tags: | - type=raw,value=return-n,enable=${{ github.ref == 'refs/heads/return-n' }} + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=sha,prefix=,suffix=,format=short + type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }} - name: Create a hash from tags env: