
feat: remote engine #1666

Merged 54 commits on Dec 5, 2024
315bc2f
Merge branch 'dev' of github.com:janhq/cortex.cpp into feat/remote-en…
nguyenhoangthuan99 Nov 11, 2024
190d40b
Init remote engine
nguyenhoangthuan99 Nov 11, 2024
6cd2ec6
Merge branch 'dev' into feat/remote-engine
nguyenhoangthuan99 Nov 11, 2024
c6124ba
Fix: CI build windows
nguyenhoangthuan99 Nov 11, 2024
7441506
Merge branch 'feat/remote-engine' of github.com:janhq/cortex.cpp into…
nguyenhoangthuan99 Nov 11, 2024
135c41e
Fix: CI build windows
nguyenhoangthuan99 Nov 11, 2024
a916ec8
Fix: CI build windows
nguyenhoangthuan99 Nov 11, 2024
a9c0d8b
Fix: CI build windows
nguyenhoangthuan99 Nov 12, 2024
9d1a9d8
feat: new db schema for model and template for engine
luke-nguyen990 Nov 12, 2024
127d429
Merge branch 'feat/remote-engine' of github.com:janhq/cortex.cpp into…
nguyenhoangthuan99 Nov 12, 2024
c435918
Add remote model
nguyenhoangthuan99 Nov 12, 2024
28d3106
Add Get, List, Update support for remote models
nguyenhoangthuan99 Nov 13, 2024
6508c98
change model_id to model in remote engine
nguyenhoangthuan99 Nov 13, 2024
502a0b9
Merge dev
nguyenhoangthuan99 Nov 13, 2024
7b295f8
fix: mac compatibility
luke-nguyen990 Nov 13, 2024
d921869
chore: some refactors before making big changes
luke-nguyen990 Nov 13, 2024
18f3900
feat: db ops for engines
luke-nguyen990 Nov 13, 2024
c5148e5
chore: small refactor before more changes
luke-nguyen990 Nov 13, 2024
b2567ad
Update engine
nguyenhoangthuan99 Nov 13, 2024
6639511
Merge branch 'feat/remote-engine' of github.com:janhq/cortex.cpp into…
nguyenhoangthuan99 Nov 13, 2024
ca3972e
refine db schema, composite key for engines
luke-nguyen990 Nov 14, 2024
bedb803
add entry definition for engine at db layer
luke-nguyen990 Nov 14, 2024
a10294e
complete add, get engine operations
luke-nguyen990 Nov 14, 2024
5f9e706
engine managements
luke-nguyen990 Nov 14, 2024
a1f95b2
Merge branch 'dev' into feat/remote-engine
nguyenhoangthuan99 Nov 14, 2024
e50c0e2
Merge branch 'dev' into feat/remote-engine
nguyenhoangthuan99 Nov 14, 2024
d3187db
Merge branch 'dev' into feat/remote-engine
nguyenhoangthuan99 Nov 14, 2024
cab8b44
Integrate with remote engine to run remote model
nguyenhoangthuan99 Nov 14, 2024
263720e
Merge branch 'dev' into feat/remote-engine
nguyenhoangthuan99 Nov 14, 2024
9d50f5f
error handling and response transform
nguyenhoangthuan99 Nov 14, 2024
689e38c
Merge branch 'feat/remote-engine' of github.com:janhq/cortex.cpp into…
nguyenhoangthuan99 Nov 14, 2024
932f7ed
Merge branch 'dev' into feat/remote-engine
nguyenhoangthuan99 Nov 15, 2024
fa434a4
Support for stream request
nguyenhoangthuan99 Nov 15, 2024
06553db
Merge branch 'feat/remote-engine' of github.com:janhq/cortex.cpp into…
nguyenhoangthuan99 Nov 15, 2024
3f9f451
chore: fix conflicts
sangjanai Nov 26, 2024
5473a75
feat: anthropic
vansangpfiev Nov 26, 2024
071d84c
feat: support anthropic
vansangpfiev Nov 27, 2024
9444dcb
feat: support anthropic
vansangpfiev Nov 27, 2024
2cbdf58
Merge branch 'dev' of https://github.com/janhq/cortex.cpp into feat/r…
sangjanai Nov 28, 2024
0e3ad7d
Merge branch 'dev' of https://github.com/janhq/cortex.cpp into feat/r…
sangjanai Nov 29, 2024
086a195
Merge branch 'feat/remote-engine' of https://github.com/janhq/cortex.…
sangjanai Nov 29, 2024
1b30777
chore: rename
sangjanai Nov 29, 2024
e3371a0
chore: cleanup and fix unit tests
sangjanai Nov 29, 2024
e14ee6e
fix: issue with db
vansangpfiev Nov 29, 2024
cee2838
chore: refactor remote engine
sangjanai Nov 29, 2024
fd81fc9
Merge branch 'dev' of https://github.com/janhq/cortex.cpp into feat/r…
sangjanai Dec 1, 2024
ac8aeff
Merge branch 'dev' of https://github.com/janhq/cortex.cpp into feat/r…
sangjanai Dec 2, 2024
1f2a5dc
fix: e2e tests
sangjanai Dec 2, 2024
3220ad8
fix: e2e tests
sangjanai Dec 2, 2024
5b97c93
Merge branch 'dev' of https://github.com/janhq/cortex.cpp into feat/r…
sangjanai Dec 3, 2024
bcc4c80
Merge branch 'dev' of https://github.com/janhq/cortex.cpp into feat/r…
sangjanai Dec 4, 2024
90694c4
chore: API docs
sangjanai Dec 4, 2024
a7e4659
fix: use different interface for remote engine
vansangpfiev Dec 4, 2024
8e44992
Merge branch 'dev' into feat/remote-engine
vansangpfiev Dec 4, 2024
97 changes: 97 additions & 0 deletions docs/docs/capabilities/embeddings.md
Original file line number Diff line number Diff line change
@@ -6,3 +6,100 @@ title: Embeddings
:::

cortex.cpp now supports an embeddings endpoint that is fully OpenAI-compatible.


For embeddings API usage, please refer to the [API references](/api-reference#tag/chat/POST/v1/embeddings). This tutorial shows you how to use embeddings in cortex with the OpenAI Python SDK.

## Embeddings with the OpenAI-compatible API

### 1. Start the server and run a model

```sh
cortex run llama3.1:8b-gguf-q4-km
```

### 2. Create a script `embeddings.py` with this content

```python
from openai import OpenAI

ENDPOINT = "http://localhost:39281/v1"
MODEL = "llama3.1:8b-gguf-q4-km"

client = OpenAI(
    base_url=ENDPOINT,
    api_key="not-needed"
)
```

### 3. Create embeddings

```python
response = client.embeddings.create(input="embedding", model=MODEL, encoding_format="base64")
print(response)
```

The response will look like this:

```
CreateEmbeddingResponse(
data=[
Embedding(
embedding='hjuAPOD8TryuPU8...',
index=0,
object='embedding'
)
],
model='meta-llama3.1-8b-instruct',
object='list',
usage=Usage(
prompt_tokens=2,
total_tokens=2
)
)
```


The output embedding is encoded as a base64 string. By default, the model outputs the embedding as a list of floats.

```python
response = client.embeddings.create(input="embedding", model=MODEL)
print(response)
```

The result will be:

```
CreateEmbeddingResponse(
data=[
Embedding(
embedding=[0.1, 0.3, 0.4 ....],
index=0,
object='embedding'
)
],
model='meta-llama3.1-8b-instruct',
object='list',
usage=Usage(
prompt_tokens=2,
total_tokens=2
)
)
```

Cortex also supports all the input types that [OpenAI](https://platform.openai.com/docs/api-reference/embeddings/create#embeddings-create-input) does.

```python
# input as a string
response = client.embeddings.create(input="embedding", model=MODEL)

# input as an array of strings
response = client.embeddings.create(input=["embedding"], model=MODEL)

# input as an array of tokens
response = client.embeddings.create(input=[12, 44, 123], model=MODEL)

# input as an array of arrays containing tokens
response = client.embeddings.create(input=[[912, 312, 54], [12, 433, 1241]], model=MODEL)
```
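A common use of the embeddings endpoint is semantic similarity. The sketch below computes the cosine similarity between two vectors; the commented lines assume the `client` and `MODEL` from step 2 and a running server, and the helper name is our own:

```python
import math

def cosine_similarity(a: list[float], b: list[float]) -> float:
    # cos(theta) = (a . b) / (|a| * |b|)
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

# With a running server, the vectors would come from the endpoint, e.g.:
# e1 = client.embeddings.create(input="cat", model=MODEL).data[0].embedding
# e2 = client.embeddings.create(input="kitten", model=MODEL).data[0].embedding
# print(cosine_similarity(e1, e2))

print(cosine_similarity([1.0, 0.0], [1.0, 0.0]))  # 1.0
```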

16 changes: 13 additions & 3 deletions docs/static/openapi/cortex.json
@@ -190,7 +190,7 @@
]
}
},
"v1/embeddings": {
"/v1/embeddings": {
"post": {
"summary": "Create embeddings",
"description": "Creates an embedding vector representing the input text.",
@@ -204,22 +204,29 @@
"input": {
"oneOf": [
{
"type": "string"
"type": "string",
"description":"The string that will be turned into an embedding."
},
{
"type": "array",
"description" : "The array of strings that will be turned into an embedding.",
"items": {
"type": "string"
}
},
{
"type": "array",
"description": "The array of integers that will be turned into an embedding.",
"items": {
"type": "integer"

}
},
{
"type": "array",

"description" : "The array of arrays containing integers that will be turned into an embedding.",

"items": {
"type": "array",
"items": {
@@ -290,7 +297,10 @@
}
}
}
}
},
"tags": [
"Embeddings"
]
}
},
"/v1/chat/completions": {
14 changes: 6 additions & 8 deletions engine/config/chat_template_renderer.h
@@ -48,10 +48,11 @@
#include <vector>
namespace config {

#if (defined(_MSC_VER) && _MSC_VER >= 1900 && defined(__cpp_char8_t)) || __cplusplus >= 202002L
#define LU8(x) reinterpret_cast<const char*>(u8##x)
#if (defined(_MSC_VER) && _MSC_VER >= 1900 && defined(__cpp_char8_t)) || \
__cplusplus >= 202002L
#define LU8(x) reinterpret_cast<const char*>(u8##x)
#else
#define LU8(x) u8##x
#define LU8(x) u8##x
#endif

typedef struct llama_chat_message {
@@ -167,13 +168,10 @@ static int32_t llama_chat_apply_template_internal(
std::string system_prompt = "";
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
// there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
system_prompt = trim(message->content);
continue;
}
// in gemma, "assistant" is "model"
role = role == "assistant" ? "model" : message->role;
// in gemma2, "system" is "user"
role = role == "system" ? "user" : role;
ss << "<start_of_turn>" << role << "\n";
if (!system_prompt.empty() && role != "model") {
ss << system_prompt << "\n\n";
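The role-mapping logic in this hunk can be sketched in Python. This is a hypothetical simplification for illustration only; the real implementation is the C++ in `chat_template_renderer.h` above:

```python
def render_gemma(chat: list[dict]) -> str:
    # Gemma's template has no "system" or "assistant" roles:
    # "assistant" is rendered as "model", and "system" as "user".
    parts = []
    for msg in chat:
        role = msg["role"]
        role = "model" if role == "assistant" else role
        role = "user" if role == "system" else role
        parts.append(f"<start_of_turn>{role}\n{msg['content'].strip()}<end_of_turn>\n")
    return "".join(parts)

print(render_gemma([
    {"role": "system", "content": "Be brief."},
    {"role": "user", "content": "Hi"},
]))
```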