From de619670d8f40c504600e0bfa1f3c392c442652e Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Thu, 10 Oct 2024 11:21:26 -0700
Subject: [PATCH 01/14] Fix block allocation for prefix caching

---
 router/src/scheduler.rs | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/router/src/scheduler.rs b/router/src/scheduler.rs
index a847936f0..d8baae290 100644
--- a/router/src/scheduler.rs
+++ b/router/src/scheduler.rs
@@ -370,19 +370,17 @@ impl AdapterSchedulerState {
 
                     // If we're prefix caching, this check could be under-estimating the number of available blocks
                     // due to shared prefixes, so we'll let the block allocator determine whether we have enough space.
-                    if !self.prefix_caching {
-                        if prefill_tokens > prefill_token_budget
-                            || (prefill_tokens + decode_tokens + self.speculate) > token_budget
-                        {
-                            // Entry is over budget
-                            // Add it back to the front
-                            tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate);
-                            self.queues_state
-                                .lock()
-                                .await
-                                .push_front(&adapter, id, entry);
-                            break;
-                        }
+                    if prefill_tokens > prefill_token_budget
+                        || (prefill_tokens + decode_tokens + self.speculate) > token_budget
+                    {
+                        // Entry is over budget
+                        // Add it back to the front
+                        tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate);
+                        self.queues_state
+                            .lock()
+                            .await
+                            .push_front(&adapter, id, entry);
+                        break;
                     }
 
                     let tokens = entry.request.input_length()

From 734636c3a2f09f9742f11913cb95385da5c404fa Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Sun, 13 Oct 2024 18:24:14 -0700
Subject: [PATCH 02/14] TEMP: build and tag Docker image from return-n branch

---
 .github/workflows/build.yaml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 56808422d..939d952e2 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -5,6 +5,7 @@ on:
   push:
     branches:
       - 'main'
+      - 'return-n'
     tags:
       - 'v*'
 
@@ -62,10 +63,7 @@ jobs:
           images: |
             ghcr.io/predibase/lorax
           tags: |
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=sha,prefix=,suffix=,format=short
-            type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }}
+            type=raw,value=return-n,enable=${{ github.ref == 'refs/heads/return-n' }}
 
       - name: Create a hash from tags
         env:

From 98f9d34f9a5673bf4b0b6fd2b7a361d5fc68e641 Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Sun, 13 Oct 2024 18:38:39 -0700
Subject: [PATCH 03/14] Fix: remove prefix_caching field from AdapterSchedulerState

---
 router/src/scheduler.rs | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/router/src/scheduler.rs b/router/src/scheduler.rs
index d8baae290..7777938be 100644
--- a/router/src/scheduler.rs
+++ b/router/src/scheduler.rs
@@ -198,9 +198,6 @@ struct AdapterSchedulerState {
     /// Speculation amount
     speculate: u32,
 
-    /// Prefix caching
-    prefix_caching: bool,
-
     /// Paged Attention Block Allocation
     block_allocator: Option<BlockAllocator>,
 }
@@ -242,7 +239,6 @@ impl AdapterSchedulerState {
             block_size,
             window_size,
             speculate,
-            prefix_caching,
             block_allocator,
         }
     }

From e46e1e66450a635bb17d73b5427fbf51cb0a8370 Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Sun, 13 Oct 2024 18:53:09 -0700
Subject: [PATCH 04/14] Change tag

---
 .github/workflows/build.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 939d952e2..753050076 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -63,7 +63,7 @@ jobs:
           images: |
             ghcr.io/predibase/lorax
           tags: |
-            type=raw,value=return-n,enable=${{ github.ref == 'refs/heads/return-n' }}
+            type=raw,value=returnn,enable=${{ github.ref == 'refs/heads/return-n' }}
 
       - name: Create a hash from tags
         env:

From 279e09e381099e9bec71a2a89233c90c47ae5906 Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Sun, 13 Oct 2024 19:26:20 -0700
Subject: [PATCH 05/14] Revert SOCI

---
 .github/workflows/build.yaml | 111 +++++------------------------------
 1 file changed, 15 insertions(+), 96 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 753050076..1f8752150 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -8,6 +8,11 @@ on:
       - 'return-n'
     tags:
       - 'v*'
+  pull_request:
+    types: [opened, synchronize, reopened]
+    branches: [ master ]
+    paths:
+      - '.github/workflows/build.yaml'
 
 jobs:
   build-and-push-image:
@@ -28,7 +33,7 @@ jobs:
         uses: actions/checkout@v3
         with:
           submodules: recursive
-
+      
       - name: Free Disk Space (Ubuntu)
         uses: jlumbroso/free-disk-space@main
         with:
@@ -39,23 +44,6 @@ jobs:
           large-packages: false
           swap-storage: true
 
-      - name: Install soci
-        uses: lerentis/soci-installer@v1.0.1
-        with:
-          soci-release: 'v0.4.0'
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2.10.0
-
-      - name: Set up containerd for ubuntu
-        uses: crazy-max/ghaction-setup-containerd@v2.2.0
-        with:
-          config-inline: |
-            version = 2
-
-            # persistent data location
-            root = "/runner/build/containerd"
-
       - name: Docker meta
         id: meta
         uses: docker/metadata-action@v5
@@ -63,89 +51,20 @@ jobs:
           images: |
             ghcr.io/predibase/lorax
           tags: |
-            type=raw,value=returnn,enable=${{ github.ref == 'refs/heads/return-n' }}
-
-      - name: Create a hash from tags
-        env:
-          tags: ${{ steps.meta.outputs.tags }}
-        id: vars
-        run: |
-          tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}')
-          echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT
-          echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT
-          echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT
-          echo "image_path=/runner/build/images/lorax" >> $GITHUB_OUTPUT
-
-      - name: Create and update image/cache directory
-        env:
-          image_dir: ${{ steps.vars.outputs.image_dir }}
-          cache_dir: ${{ steps.vars.outputs.cache_dir }}
-        run: |
-          sudo mkdir -p $image_dir
-          sudo chown ubuntu:ubuntu $image_dir
-
-          sudo mkdir -p $cache_dir
-          sudo chown ubuntu:ubuntu $cache_dir
-
-      - name: Export Docker image as OCI
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          file: ./Dockerfile  # Path to your Dockerfile
-          push: false
-          tags: ${{ steps.meta.outputs.tags }}
-          outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz
-          cache-from: type=local,src=${{ steps.vars.outputs.cache_dir }}
-          cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars.outputs.cache_dir }}
-
-      - name: Import image in containerd
-        env:
-          tag_hash: ${{ steps.vars.outputs.tag_hash }}
-          image_path: ${{ steps.vars.outputs.image_path }}
-        run: |
-          echo "Importing $image_path-$tag_hash to Containerd"
-          sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz
-
+            type=raw,value=return-n
+      
       - name: Log in to GitHub Container Registry
         uses: docker/login-action@v1
         with:
           registry: ghcr.io
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GHCR_PAT }}
-
-      - name: Push image with containerd
-        env:
-          tags: ${{ steps.meta.outputs.tags }}
-        run: |
-          for tag in $tags
-          do
-            echo "Pushing $tag to GHCR"
-            sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag
-          done
-
-      - name: Create and push soci index
-        env:
+      
+      - name: Build and Push Image
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          file: ./Dockerfile  # Path to your Dockerfile
+          push: true
           tags: ${{ steps.meta.outputs.tags }}
-        run: |
-          export SOCI_PATH=$HOME/.soci/soci
-          for tag in $tags
-          do
-            echo "Creating soci index for $tag"
-            sudo $SOCI_PATH create $tag
-            echo "Pushing soci index for $tag"
-            sudo $SOCI_PATH push --user ${{ github.repository_owner }}:${{ secrets.GHCR_PAT }} $tag
-          done
-
-      - name: Prune older images
-        env:
-          tag_hash: ${{ steps.vars.outputs.tag_hash }}
-          image_path: ${{ steps.vars.outputs.image_path }}
-        run: |
-          # Delete images older than a day from docker store
-          docker image prune -a -f --filter "until=24h"
-
-          # Delete the on disk copy
-          rm -rf "$image_path-$tag_hash.tar.gz"
 
-          # Delete the SHA image(s) from containerd store
-          sudo ctr i rm $(sudo ctr i ls -q)

From 3f6cd52dc21fb3dc9f3c52a9c1c392ed1dea490d Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Sun, 13 Oct 2024 21:41:29 -0700
Subject: [PATCH 06/14] Fix mixtral fp8

---
 .../custom_modeling/flash_mixtral_modeling.py      | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/server/lorax_server/models/custom_modeling/flash_mixtral_modeling.py b/server/lorax_server/models/custom_modeling/flash_mixtral_modeling.py
index 0455788a0..2a7622c99 100644
--- a/server/lorax_server/models/custom_modeling/flash_mixtral_modeling.py
+++ b/server/lorax_server/models/custom_modeling/flash_mixtral_modeling.py
@@ -172,7 +172,11 @@ def _load_gqa(config, prefix: str, weights):
         dim=0,
     )
 
-    if config.quantize not in ["gptq", "awq"]:
+    input_scale, weight_scale = None, None
+    if type(weight) is tuple:
+        weight, input_scale, weight_scale = weight
+
+    if config.quantize not in ["gptq", "awq", "fp8"]:
         weight = weight.to(dtype=weights.dtype).to(device=weights.device)
 
         head_size = config.hidden_size // config.num_attention_heads
@@ -183,7 +187,13 @@ def _load_gqa(config, prefix: str, weights):
             config.hidden_size,
         ], f"{list(weight.shape)} != {[(num_heads + 2 * num_key_value_heads) * head_size, config.hidden_size]}"
 
-    return TensorParallelColumnLinear(get_linear(weight, bias=None, quantize=config.quantize))
+    return TensorParallelColumnLinear(get_linear(
+        weight, 
+        bias=None, 
+        quantize=config.quantize, 
+        weight_scale=weight_scale,
+        input_scale=input_scale,
+    ))
 
 
 def _load_experts(config, prefix, mat, weights):

From 9670e449005d6528e380d4310866744191639e81 Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Mon, 14 Oct 2024 21:30:27 -0700
Subject: [PATCH 07/14] Revert "Revert SOCI"

This reverts commit 279e09e381099e9bec71a2a89233c90c47ae5906.
---
 .github/workflows/build.yaml | 111 ++++++++++++++++++++++++++++++-----
 1 file changed, 96 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 1f8752150..753050076 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -8,11 +8,6 @@ on:
       - 'return-n'
     tags:
       - 'v*'
-  pull_request:
-    types: [opened, synchronize, reopened]
-    branches: [ master ]
-    paths:
-      - '.github/workflows/build.yaml'
 
 jobs:
   build-and-push-image:
@@ -33,7 +28,7 @@ jobs:
         uses: actions/checkout@v3
         with:
           submodules: recursive
-      
+
       - name: Free Disk Space (Ubuntu)
         uses: jlumbroso/free-disk-space@main
         with:
@@ -44,6 +39,23 @@ jobs:
           large-packages: false
           swap-storage: true
 
+      - name: Install soci
+        uses: lerentis/soci-installer@v1.0.1
+        with:
+          soci-release: 'v0.4.0'
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2.10.0
+
+      - name: Set up containerd for ubuntu
+        uses: crazy-max/ghaction-setup-containerd@v2.2.0
+        with:
+          config-inline: |
+            version = 2
+
+            # persistent data location
+            root = "/runner/build/containerd"
+
       - name: Docker meta
         id: meta
         uses: docker/metadata-action@v5
@@ -51,20 +63,89 @@ jobs:
           images: |
             ghcr.io/predibase/lorax
           tags: |
-            type=raw,value=return-n
-      
+            type=raw,value=returnn,enable=${{ github.ref == 'refs/heads/return-n' }}
+
+      - name: Create a hash from tags
+        env:
+          tags: ${{ steps.meta.outputs.tags }}
+        id: vars
+        run: |
+          tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}')
+          echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT
+          echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT
+          echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT
+          echo "image_path=/runner/build/images/lorax" >> $GITHUB_OUTPUT
+
+      - name: Create and update image/cache directory
+        env:
+          image_dir: ${{ steps.vars.outputs.image_dir }}
+          cache_dir: ${{ steps.vars.outputs.cache_dir }}
+        run: |
+          sudo mkdir -p $image_dir
+          sudo chown ubuntu:ubuntu $image_dir
+
+          sudo mkdir -p $cache_dir
+          sudo chown ubuntu:ubuntu $cache_dir
+
+      - name: Export Docker image as OCI
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./Dockerfile  # Path to your Dockerfile
+          push: false
+          tags: ${{ steps.meta.outputs.tags }}
+          outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz
+          cache-from: type=local,src=${{ steps.vars.outputs.cache_dir }}
+          cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars.outputs.cache_dir }}
+
+      - name: Import image in containerd
+        env:
+          tag_hash: ${{ steps.vars.outputs.tag_hash }}
+          image_path: ${{ steps.vars.outputs.image_path }}
+        run: |
+          echo "Importing $image_path-$tag_hash to Containerd"
+          sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz
+
       - name: Log in to GitHub Container Registry
         uses: docker/login-action@v1
         with:
           registry: ghcr.io
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GHCR_PAT }}
-      
-      - name: Build and Push Image
-        uses: docker/build-push-action@v2
-        with:
-          context: .
-          file: ./Dockerfile  # Path to your Dockerfile
-          push: true
+
+      - name: Push image with containerd
+        env:
+          tags: ${{ steps.meta.outputs.tags }}
+        run: |
+          for tag in $tags
+          do
+            echo "Pushing $tag to GHCR"
+            sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag
+          done
+
+      - name: Create and push soci index
+        env:
           tags: ${{ steps.meta.outputs.tags }}
+        run: |
+          export SOCI_PATH=$HOME/.soci/soci
+          for tag in $tags
+          do
+            echo "Creating soci index for $tag"
+            sudo $SOCI_PATH create $tag
+            echo "Pushing soci index for $tag"
+            sudo $SOCI_PATH push --user ${{ github.repository_owner }}:${{ secrets.GHCR_PAT }} $tag
+          done
+
+      - name: Prune older images
+        env:
+          tag_hash: ${{ steps.vars.outputs.tag_hash }}
+          image_path: ${{ steps.vars.outputs.image_path }}
+        run: |
+          # Delete images older than a day from docker store
+          docker image prune -a -f --filter "until=24h"
+
+          # Delete the on disk copy
+          rm -rf "$image_path-$tag_hash.tar.gz"
 
+          # Delete the SHA image(s) from containerd store
+          sudo ctr i rm $(sudo ctr i ls -q)

From 4832ce825e2f34b16d7be4088ac62812f578d05d Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Mon, 14 Oct 2024 21:30:53 -0700
Subject: [PATCH 08/14] Change tag

---
 .github/workflows/build.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 753050076..939d952e2 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -63,7 +63,7 @@ jobs:
           images: |
             ghcr.io/predibase/lorax
           tags: |
-            type=raw,value=returnn,enable=${{ github.ref == 'refs/heads/return-n' }}
+            type=raw,value=return-n,enable=${{ github.ref == 'refs/heads/return-n' }}
 
       - name: Create a hash from tags
         env:

From f40fea02215d6492d4266aebb66a4708118da742 Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Mon, 14 Oct 2024 22:24:36 -0700
Subject: [PATCH 09/14] Fixed return n for chat

---
 router/src/lib.rs | 47 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/router/src/lib.rs b/router/src/lib.rs
index ff51fe7ca..b15612237 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -386,7 +386,7 @@ pub struct SimpleToken {
     stop: usize,
 }
 
-#[derive(Serialize, ToSchema)]
+#[derive(Serialize, ToSchema, Clone)]
 #[serde(rename_all(serialize = "snake_case"))]
 pub(crate) enum FinishReason {
     #[schema(rename = "length")]
@@ -886,21 +886,46 @@ impl From<GenerateResponse> for ChatCompletionResponse {
             .unwrap_or(0);
         let total_tokens = prompt_tokens + completion_tokens;
 
+        // assign choices as the generated text, and include the best of sequences if available
+        let mut choices = vec![ChatCompletionResponseChoice {
+            index: 0,
+            message: ChatMessage {
+                role: Some("assistant".to_string()),
+                content: Some(resp.generated_text),
+            },
+            finish_reason: resp
+                .details
+                .as_ref()
+                .map(|x| CompletionFinishReason::from(x.finish_reason.clone())),
+        }];
+
+        choices.extend(
+            resp.details
+                .as_ref()
+                .and_then(|x| x.best_of_sequences.as_ref())
+                .into_iter()
+                .flat_map(|seqs| {
+                    seqs.iter()
+                        .enumerate()
+                        .map(|(index, seq)| ChatCompletionResponseChoice {
+                            index: index as i32 + 1,
+                            message: ChatMessage {
+                                role: Some("assistant".to_string()),
+                                content: Some(seq.generated_text.clone()),
+                            },
+                            finish_reason: Some(CompletionFinishReason::from(
+                                seq.finish_reason.clone(),
+                            )),
+                        })
+                }),
+        );
+
         ChatCompletionResponse {
             id: "null".to_string(),
             object: "text_completion".to_string(),
             created: 0,
             model: "null".to_string(),
-            choices: vec![ChatCompletionResponseChoice {
-                index: 0,
-                message: ChatMessage {
-                    role: Some("assistant".to_string()),
-                    content: Some(resp.generated_text),
-                },
-                finish_reason: resp
-                    .details
-                    .map(|x| CompletionFinishReason::from(x.finish_reason)),
-            }],
+            choices: choices,
             usage: UsageInfo {
                 prompt_tokens: prompt_tokens,
                 total_tokens: total_tokens,

From ea8169bb48365b357b7b704b434c0b54dd2ed4a1 Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Mon, 14 Oct 2024 22:31:46 -0700
Subject: [PATCH 10/14] Replace base model id with empty adapter_id

---
 router/src/server.rs | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/router/src/server.rs b/router/src/server.rs
index 7a04a6d6f..bb2b7a2d2 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -256,10 +256,17 @@ async fn chat_completions_v1(
         }
     };
 
+    let mut adapter_id = Some(req.model.clone());
+    if req.model == info.model_id.as_str() {
+        // Allow user to specify the base model, but treat it as an empty adapter_id
+        tracing::debug!("Replacing base model {0} with empty adapter_id", req.model);
+        adapter_id = None;
+    }
+
     let mut gen_req = CompatGenerateRequest {
         inputs: inputs.to_string(),
         parameters: GenerateParameters {
-            adapter_id: req.model.parse().ok(),
+            adapter_id: adapter_id,
             adapter_source: req.adapter_source,
             adapter_parameters: None,
             api_token: req.api_token,

From 94e0333c37adc2d5d081da8ac192748351b873b9 Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Tue, 15 Oct 2024 08:51:21 -0700
Subject: [PATCH 11/14] Use extra_body

---
 router/src/lib.rs    | 20 ++++++++++++--------
 router/src/server.rs | 21 +++++++++++++++------
 2 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/router/src/lib.rs b/router/src/lib.rs
index b15612237..7af1531eb 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -562,6 +562,17 @@ impl From<Message> for TextMessage {
     }
 }
 
+#[derive(Clone, Debug, Deserialize, ToSchema)]
+struct ExtraBody {
+    // TODO(travis): add other LoRAX params here
+    response_format: Option<ResponseFormat>,
+    repetition_penalty: Option<f32>,
+    top_k: Option<i32>,
+    ignore_eos_token: Option<bool>,
+    adapter_source: Option<String>,
+    api_token: Option<String>,
+}
+
 #[derive(Clone, Debug, Deserialize, ToSchema)]
 struct ChatCompletionRequest {
     model: String,
@@ -582,14 +593,7 @@ struct ChatCompletionRequest {
     #[allow(dead_code)] // For now allow this field even though it is unused
     user: Option<String>,
     seed: Option<u64>,
-    // Additional parameters
-    // TODO(travis): add other LoRAX params here
-    response_format: Option<ResponseFormat>,
-    repetition_penalty: Option<f32>,
-    top_k: Option<i32>,
-    ignore_eos_token: Option<bool>,
-    adapter_source: Option<String>,
-    api_token: Option<String>,
+    extra_body: Option<ExtraBody>,
 }
 
 #[derive(Clone, Debug, Deserialize, ToSchema)]
diff --git a/router/src/server.rs b/router/src/server.rs
index bb2b7a2d2..313c3f6f2 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -267,18 +267,24 @@ async fn chat_completions_v1(
         inputs: inputs.to_string(),
         parameters: GenerateParameters {
             adapter_id: adapter_id,
-            adapter_source: req.adapter_source,
+            adapter_source: req
+                .extra_body
+                .as_ref()
+                .and_then(|x| x.adapter_source.clone()),
             adapter_parameters: None,
-            api_token: req.api_token,
+            api_token: req.extra_body.as_ref().and_then(|x| x.api_token.clone()),
             best_of: req.n.map(|x| x as usize),
             temperature: req.temperature,
-            repetition_penalty: req.repetition_penalty,
-            top_k: req.top_k,
+            repetition_penalty: req.extra_body.as_ref().and_then(|x| x.repetition_penalty),
+            top_k: req.extra_body.as_ref().and_then(|x| x.top_k),
             top_p: req.top_p,
             typical_p: None,
             do_sample: !req.n.is_none(),
             max_new_tokens: req.max_tokens.map(|x| x as u32),
-            ignore_eos_token: req.ignore_eos_token.unwrap_or(false),
+            ignore_eos_token: req
+                .extra_body
+                .as_ref()
+                .map_or(false, |x| x.ignore_eos_token.unwrap_or(false)),
             return_full_text: None,
             stop: req.stop,
             truncate: None,
@@ -288,7 +294,10 @@ async fn chat_completions_v1(
             return_k_alternatives: None,
             apply_chat_template: false,
             seed: req.seed,
-            response_format: req.response_format,
+            response_format: req
+                .extra_body
+                .as_ref()
+                .and_then(|x| x.response_format.clone()),
         },
         stream: req.stream.unwrap_or(false),
     };

From 96f7130aa7901f43f61188759ac03d461943ea3e Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Tue, 15 Oct 2024 08:52:54 -0700
Subject: [PATCH 12/14] Put login first

---
 .github/workflows/build.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 939d952e2..8026b8832 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -24,6 +24,13 @@ jobs:
       security-events: write
 
     steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v1
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GHCR_PAT }}
+      
       - name: Checkout repository
         uses: actions/checkout@v3
         with:
@@ -106,13 +113,6 @@ jobs:
           echo "Importing $image_path-$tag_hash to Containerd"
           sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz
 
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v1
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GHCR_PAT }}
-
       - name: Push image with containerd
         env:
           tags: ${{ steps.meta.outputs.tags }}

From 850988983053dfd9837ddffb61c4b2e8ea228ec7 Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Tue, 15 Oct 2024 09:33:34 -0700
Subject: [PATCH 13/14] Revert "Use extra_body"

This reverts commit 94e0333c37adc2d5d081da8ac192748351b873b9.
---
 router/src/lib.rs    | 20 ++++++++------------
 router/src/server.rs | 21 ++++++---------------
 2 files changed, 14 insertions(+), 27 deletions(-)

diff --git a/router/src/lib.rs b/router/src/lib.rs
index 7af1531eb..b15612237 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -562,17 +562,6 @@ impl From<Message> for TextMessage {
     }
 }
 
-#[derive(Clone, Debug, Deserialize, ToSchema)]
-struct ExtraBody {
-    // TODO(travis): add other LoRAX params here
-    response_format: Option<ResponseFormat>,
-    repetition_penalty: Option<f32>,
-    top_k: Option<i32>,
-    ignore_eos_token: Option<bool>,
-    adapter_source: Option<String>,
-    api_token: Option<String>,
-}
-
 #[derive(Clone, Debug, Deserialize, ToSchema)]
 struct ChatCompletionRequest {
     model: String,
@@ -593,7 +582,14 @@ struct ChatCompletionRequest {
     #[allow(dead_code)] // For now allow this field even though it is unused
     user: Option<String>,
     seed: Option<u64>,
-    extra_body: Option<ExtraBody>,
+    // Additional parameters
+    // TODO(travis): add other LoRAX params here
+    response_format: Option<ResponseFormat>,
+    repetition_penalty: Option<f32>,
+    top_k: Option<i32>,
+    ignore_eos_token: Option<bool>,
+    adapter_source: Option<String>,
+    api_token: Option<String>,
 }
 
 #[derive(Clone, Debug, Deserialize, ToSchema)]
diff --git a/router/src/server.rs b/router/src/server.rs
index 313c3f6f2..bb2b7a2d2 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -267,24 +267,18 @@ async fn chat_completions_v1(
         inputs: inputs.to_string(),
         parameters: GenerateParameters {
             adapter_id: adapter_id,
-            adapter_source: req
-                .extra_body
-                .as_ref()
-                .and_then(|x| x.adapter_source.clone()),
+            adapter_source: req.adapter_source,
             adapter_parameters: None,
-            api_token: req.extra_body.as_ref().and_then(|x| x.api_token.clone()),
+            api_token: req.api_token,
             best_of: req.n.map(|x| x as usize),
             temperature: req.temperature,
-            repetition_penalty: req.extra_body.as_ref().and_then(|x| x.repetition_penalty),
-            top_k: req.extra_body.as_ref().and_then(|x| x.top_k),
+            repetition_penalty: req.repetition_penalty,
+            top_k: req.top_k,
             top_p: req.top_p,
             typical_p: None,
             do_sample: !req.n.is_none(),
             max_new_tokens: req.max_tokens.map(|x| x as u32),
-            ignore_eos_token: req
-                .extra_body
-                .as_ref()
-                .map_or(false, |x| x.ignore_eos_token.unwrap_or(false)),
+            ignore_eos_token: req.ignore_eos_token.unwrap_or(false),
             return_full_text: None,
             stop: req.stop,
             truncate: None,
@@ -294,10 +288,7 @@ async fn chat_completions_v1(
             return_k_alternatives: None,
             apply_chat_template: false,
             seed: req.seed,
-            response_format: req
-                .extra_body
-                .as_ref()
-                .and_then(|x| x.response_format.clone()),
+            response_format: req.response_format,
         },
         stream: req.stream.unwrap_or(false),
     };

From 6a2b7fc4afe455ea9303b7058cc31037d0e0cd4b Mon Sep 17 00:00:00 2001
From: Travis Addair <travis@predibase.com>
Date: Tue, 15 Oct 2024 09:44:56 -0700
Subject: [PATCH 14/14] Revert docker

---
 .github/workflows/build.yaml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 8026b8832..10b27fbc7 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -5,7 +5,6 @@ on:
   push:
     branches:
       - 'main'
-      - 'return-n'
     tags:
       - 'v*'
 
@@ -70,7 +69,10 @@ jobs:
           images: |
             ghcr.io/predibase/lorax
           tags: |
-            type=raw,value=return-n,enable=${{ github.ref == 'refs/heads/return-n' }}
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=sha,prefix=,suffix=,format=short
+            type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }}
 
       - name: Create a hash from tags
         env: