From 9b2c8bbb8278d2bc69e1507dd79af1c0775f9144 Mon Sep 17 00:00:00 2001
From: Louis <louis@jan.ai>
Date: Tue, 11 Jun 2024 14:52:06 +0700
Subject: [PATCH] chore: update models settings (#673)

---
 .../src/domain/models/model.interface.ts      | 78 +++++++++++++--
 .../usecases/models.cli.usecases.ts           | 14 +--
 .../commanders/usecases/ps.cli.usecases.ts    | 60 ++++++------
 .../src/infrastructure/constants/cortex.ts    |  5 +
 .../dtos/chat/create-chat-completion.dto.ts   | 22 +++--
 .../dtos/cortex/start-cortex.dto.ts           |  5 +-
 .../dtos/models/model-settings.dto.ts         | 88 ++++++++++++++++-
 .../infrastructure/dtos/models/model.dto.ts   | 94 ++++++++++++++++++-
 .../providers/cortex/cortex.provider.ts       |  5 +-
 .../src/usecases/threads/threads.usecases.ts  |  1 +
 10 files changed, 317 insertions(+), 55 deletions(-)

diff --git a/cortex-js/src/domain/models/model.interface.ts b/cortex-js/src/domain/models/model.interface.ts
index e42ce2cee..9af4bd618 100644
--- a/cortex-js/src/domain/models/model.interface.ts
+++ b/cortex-js/src/domain/models/model.interface.ts
@@ -1,8 +1,3 @@
-export interface ModelArtifact {
-  mmproj?: string;
-  llama_model_path?: string;
-}
-
 /**
  * Model type defines the shape of a model object.
  * @stored
@@ -90,6 +85,56 @@ export interface Model {
    */
   cpu_threads?: number;
 
+  /**
+   * The prompt to use for internal configuration
+   */
+  pre_prompt?: string;
+
+  /**
+   * The batch size for prompt eval step
+   */
+  n_batch?: number;
+
+  /**
+   * Whether to enable prompt caching
+   */
+  caching_enabled?: boolean;
+
+  /**
+   * Group attention factor in self-extend
+   */
+  grp_attn_n?: number;
+
+  /**
+   * Group attention width in self-extend
+   */
+  grp_attn_w?: number;
+
+  /**
+   * Prevent system swapping of the model to disk in macOS
+   */
+  mlock?: boolean;
+
+  /**
+   * You can constrain the sampling using GBNF grammars by providing the path to a grammar file
+   */
+  grammar_file?: string;
+
+  /**
+   * To enable Flash Attention, default is true
+   */
+  flash_attn?: boolean;
+
+  /**
+   * KV cache type: f16, q8_0, q4_0, default is f16
+   */
+  cache_type?: string;
+
+  /**
+   * To enable mmap, default is true
+   */
+  use_mmap?: boolean;
+
   /**
    * The model engine.
    */
@@ -112,10 +157,20 @@ export interface ModelSettingParams {
   llama_model_path?: string;
   mmproj?: string;
   cont_batching?: boolean;
-  vision_model?: boolean;
-  text_model?: boolean;
   engine?: string;
   stop?: string[];
+  pre_prompt?: string;
+  n_batch?: number;
+  caching_enabled?: boolean;
+  grp_attn_n?: number;
+  grp_attn_w?: number;
+  mlock?: boolean;
+  grammar_file?: string;
+  model_type?: string;
+  model_alias?: string;
+  flash_attn?: boolean;
+  cache_type?: string;
+  use_mmap?: boolean;
 }
 
 /**
@@ -133,3 +188,12 @@ export interface ModelRuntimeParams {
   presence_penalty?: number;
   engine?: string;
 }
+
+/**
+ * The model artifact object.
+ * Used in case the model files are not a raw file list
+ */
+export interface ModelArtifact {
+  mmproj?: string;
+  llama_model_path?: string;
+}
diff --git a/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
index da07499a2..651c383f7 100644
--- a/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
+++ b/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
@@ -18,7 +18,13 @@ import {
   OPEN_CHAT_3_5_JINJA,
   ZEPHYR,
   ZEPHYR_JINJA,
-} from '../../constants/prompt-constants';
+} from './../../constants/prompt-constants';
+import {
+  HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL,
+  HUGGING_FACE_REPO_MODEL_API_URL,
+  HUGGING_FACE_REPO_URL,
+  HUGGING_FACE_TREE_REF_URL,
+} from '../../constants/huggingface';
 import { ModelTokenizer } from '../types/model-tokenizer.interface';
 import { HttpService } from '@nestjs/axios';
 import { firstValueFrom } from 'rxjs';
@@ -29,12 +35,6 @@ import { join, basename } from 'path';
 import { load } from 'js-yaml';
 import { existsSync, readdirSync, readFileSync } from 'fs';
 import { isLocalModel, normalizeModelId } from '../utils/normalize-model-id';
-import {
-  HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL,
-  HUGGING_FACE_REPO_MODEL_API_URL,
-  HUGGING_FACE_REPO_URL,
-  HUGGING_FACE_TREE_REF_URL,
-} from '../../constants/huggingface';
 
 @Injectable()
 export class ModelsCliUsecases {
diff --git a/cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts
index 70d20a16b..5e2e8db38 100644
--- a/cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts
+++ b/cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts
@@ -1,5 +1,11 @@
-import { Injectable } from '@nestjs/common';
-import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
+import { HttpStatus, Injectable } from '@nestjs/common';
+import {
+  CORTEX_CPP_MODELS_URL,
+  defaultCortexCppHost,
+  defaultCortexCppPort,
+} from '@/infrastructure/constants/cortex';
+import { HttpService } from '@nestjs/axios';
+import { firstValueFrom } from 'rxjs';
 
 export interface ModelStat {
   modelId: string;
@@ -15,6 +21,7 @@ interface ModelStatResponse {
 }
 @Injectable()
 export class PSCliUsecases {
+  constructor(private readonly httpService: HttpService) {}
   /**
    * Get models running in the Cortex C++ server
    * @param host Cortex host address
@@ -25,32 +32,31 @@ export class PSCliUsecases {
     port: number = defaultCortexCppPort,
   ): Promise<ModelStat[]> {
     return new Promise<ModelStat[]>((resolve, reject) =>
-      fetch(`http://${host}:${port}/inferences/server/models`)
+      firstValueFrom(this.httpService.get(CORTEX_CPP_MODELS_URL(host, port)))
         .then((res) => {
-          if (res.ok) {
-            res
-              .json()
-              .then(({ data }: ModelStatResponse) => {
-                if (data && Array.isArray(data) && data.length > 0) {
-                  resolve(
-                    data.map((e) => {
-                      const startTime = e.start_time ?? new Date();
-                      const currentTime = new Date();
-                      const duration =
-                        currentTime.getTime() - new Date(startTime).getTime();
-                      return {
-                        modelId: e.id,
-                        engine: e.engine ?? 'cortex.llamacpp',
-                        status: 'running',
-                        duration: this.formatDuration(duration),
-                        ram: e.ram ?? '-',
-                        vram: e.vram ?? '-',
-                      };
-                    }),
-                  );
-                } else reject();
-              })
-              .catch(reject);
+          const data = res.data as ModelStatResponse;
+          if (
+            res.status === HttpStatus.OK &&
+            data &&
+            Array.isArray(data.data) &&
+            data.data.length > 0
+          ) {
+            resolve(
+              data.data.map((e) => {
+                const startTime = e.start_time ?? new Date();
+                const currentTime = new Date();
+                const duration =
+                  currentTime.getTime() - new Date(startTime).getTime();
+                return {
+                  modelId: e.id,
+                  engine: e.engine ?? 'cortex.llamacpp',
+                  status: 'running',
+                  duration: this.formatDuration(duration),
+                  ram: e.ram ?? '-',
+                  vram: e.vram ?? '-',
+                };
+              }),
+            );
           } else reject();
         })
         .catch(reject),
diff --git a/cortex-js/src/infrastructure/constants/cortex.ts b/cortex-js/src/infrastructure/constants/cortex.ts
index 6e6b4c400..09707dd1a 100644
--- a/cortex-js/src/infrastructure/constants/cortex.ts
+++ b/cortex-js/src/infrastructure/constants/cortex.ts
@@ -23,6 +23,11 @@ export const CORTEX_CPP_HEALTH_Z_URL = (
   port: number = defaultCortexCppPort,
 ) => `http://${host}:${port}/healthz`;
 
+export const CORTEX_CPP_MODELS_URL = (
+  host: string = defaultCortexCppHost,
+  port: number = defaultCortexCppPort,
+) => `http://${host}:${port}/inferences/server/models`;
+
 // INITIALIZATION
 export const CORTEX_RELEASES_URL =
   'https://api.github.com/repos/janhq/cortex/releases';
diff --git a/cortex-js/src/infrastructure/dtos/chat/create-chat-completion.dto.ts b/cortex-js/src/infrastructure/dtos/chat/create-chat-completion.dto.ts
index 5e48edc92..e2ccbc542 100644
--- a/cortex-js/src/infrastructure/dtos/chat/create-chat-completion.dto.ts
+++ b/cortex-js/src/infrastructure/dtos/chat/create-chat-completion.dto.ts
@@ -2,6 +2,7 @@ import {
   IsArray,
   IsBoolean,
   IsNumber,
+  IsOptional,
   IsString,
   ValidateNested,
 } from 'class-validator';
@@ -29,46 +30,53 @@ export class CreateChatCompletionDto {
     description:
       'Determines the format for output generation. If set to `true`, the output is generated continuously, allowing for real-time streaming of responses. If set to `false`, the output is delivered in a single JSON file.',
   })
+  @IsOptional()
   @IsBoolean()
-  stream: boolean;
+  stream?: boolean;
 
   @ApiProperty({
     description:
       'Sets the upper limit on the number of tokens the model can generate in a single output.',
   })
+  @IsOptional()
   @IsNumber()
-  max_tokens: number;
+  max_tokens?: number;
 
   @ApiProperty({
     description:
       'Defines specific tokens or phrases that signal the model to stop producing further output.',
   })
+  @IsOptional()
   @IsArray()
-  stop: string[];
+  stop?: string[];
 
   @ApiProperty({
     description:
       'Modifies the likelihood of the model repeating the same words or phrases within a single output.',
   })
+  @IsOptional()
   @IsNumber()
-  frequency_penalty: number;
+  frequency_penalty?: number;
 
   @ApiProperty({
     description:
       'Reduces the likelihood of repeating tokens, promoting novelty in the output.',
   })
+  @IsOptional()
   @IsNumber()
-  presence_penalty: number;
+  presence_penalty?: number;
 
   @ApiProperty({
     description: "Influences the randomness of the model's output.",
   })
+  @IsOptional()
   @IsNumber()
-  temperature: number;
+  temperature?: number;
 
   @ApiProperty({
     description: 'Sets probability threshold for more relevant outputs.',
   })
+  @IsOptional()
   @IsNumber()
-  top_p: number;
+  top_p?: number;
 }
diff --git a/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts b/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts
index 6a8536bfc..49e143521 100644
--- a/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts
+++ b/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts
@@ -1,6 +1,9 @@
 import { ApiProperty } from '@nestjs/swagger';
 import { IsIP, IsNumber, IsString, Max, Min } from 'class-validator';
-import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
+import {
+  defaultCortexCppHost,
+  defaultCortexCppPort,
+} from '@/infrastructure/constants/cortex';
 
 export class StartCortexDto {
   @ApiProperty({
diff --git a/cortex-js/src/infrastructure/dtos/models/model-settings.dto.ts b/cortex-js/src/infrastructure/dtos/models/model-settings.dto.ts
index 32dffe469..90431b03b 100644
--- a/cortex-js/src/infrastructure/dtos/models/model-settings.dto.ts
+++ b/cortex-js/src/infrastructure/dtos/models/model-settings.dto.ts
@@ -1,6 +1,13 @@
 import { ModelSettingParams } from '@/domain/models/model.interface';
 import { ApiProperty } from '@nestjs/swagger';
-import { IsArray, IsNumber, IsOptional, Min } from 'class-validator';
+import {
+  IsArray,
+  IsBoolean,
+  IsNumber,
+  IsOptional,
+  IsString,
+  Min,
+} from 'class-validator';
 
 export class ModelSettingsDto implements ModelSettingParams {
   // Prompt Settings
@@ -47,6 +54,85 @@ export class ModelSettingsDto implements ModelSettingParams {
   @Min(1)
   cpu_threads?: number;
 
+  @ApiProperty({
+    description: 'The prompt to use for internal configuration',
+  })
+  @IsOptional()
+  @IsString()
+  pre_prompt?: string;
+
+  @ApiProperty({
+    description: 'The batch size for prompt eval step',
+    example: 2048,
+  })
+  @IsOptional()
+  @IsNumber()
+  n_batch?: number;
+
+  @ApiProperty({
+    description: 'To enable prompt caching or not',
+    example: true,
+  })
+  @IsOptional()
+  @IsBoolean()
+  caching_enabled?: boolean;
+
+  @ApiProperty({
+    description: 'Group attention factor in self-extend',
+    example: 1,
+  })
+  @IsOptional()
+  @IsNumber()
+  grp_attn_n?: number;
+
+  @ApiProperty({
+    description: 'Group attention width in self-extend',
+    example: 512,
+  })
+  @IsOptional()
+  @IsNumber()
+  grp_attn_w?: number;
+
+  @ApiProperty({
+    description: 'Prevent system swapping of the model to disk in macOS',
+    example: false,
+  })
+  @IsOptional()
+  @IsBoolean()
+  mlock?: boolean;
+
+  @ApiProperty({
+    description:
+      'You can constrain the sampling using GBNF grammars by providing path to a grammar file',
+  })
+  @IsOptional()
+  @IsString()
+  grammar_file?: string;
+
+  @ApiProperty({
+    description: 'To enable Flash Attention, default is true',
+    example: true,
+  })
+  @IsOptional()
+  @IsBoolean()
+  flash_attn?: boolean;
+
+  @ApiProperty({
+    description: 'KV cache type: f16, q8_0, q4_0, default is f16',
+    example: 'f16',
+  })
+  @IsOptional()
+  @IsString()
+  cache_type?: string;
+
+  @ApiProperty({
+    description: 'To enable mmap, default is true',
+    example: true,
+  })
+  @IsOptional()
+  @IsBoolean()
+  use_mmap?: boolean;
+
   @ApiProperty({
     example: 'cortex.llamacpp',
     description: 'The engine to use.',
diff --git a/cortex-js/src/infrastructure/dtos/models/model.dto.ts b/cortex-js/src/infrastructure/dtos/models/model.dto.ts
index 0edecfb47..0c0d3dc96 100644
--- a/cortex-js/src/infrastructure/dtos/models/model.dto.ts
+++ b/cortex-js/src/infrastructure/dtos/models/model.dto.ts
@@ -1,6 +1,12 @@
 import { Model } from '@/domain/models/model.interface';
 import { ApiProperty } from '@nestjs/swagger';
-import { IsArray, IsBoolean, IsNumber, IsOptional } from 'class-validator';
+import {
+  IsArray,
+  IsBoolean,
+  IsNumber,
+  IsOptional,
+  IsString,
+} from 'class-validator';
 
 export class ModelDto implements Partial<Model> {
   @ApiProperty({
@@ -85,7 +91,7 @@ export class ModelDto implements Partial<Model> {
   presence_penalty?: number;
 
   // Engine Settings
-  @ApiProperty({ description: 'Determines GPU layer usage.', example: 4096 })
+  @ApiProperty({ description: 'Determines GPU layer usage.', example: 32 })
   @IsOptional()
   @IsNumber()
   ngl?: number;
@@ -101,15 +107,95 @@ export class ModelDto implements Partial<Model> {
 
   @ApiProperty({
     description:
-      'Determines CPU inference threads, limited by hardware and OS. ',
+      'Determines CPU inference threads, limited by hardware and OS.',
+    example: 10,
   })
   @IsOptional()
   @IsNumber()
   cpu_threads?: number;
 
   @ApiProperty({
-    example: 'cortex.llamacpp',
+    description: 'The prompt to use for internal configuration',
+  })
+  @IsOptional()
+  @IsString()
+  pre_prompt?: string;
+
+  @ApiProperty({
+    description: 'The batch size for prompt eval step',
+    example: 512,
+  })
+  @IsOptional()
+  @IsNumber()
+  n_batch?: number;
+
+  @ApiProperty({
+    description: 'To enable prompt caching or not',
+    example: true,
+  })
+  @IsOptional()
+  @IsBoolean()
+  caching_enabled?: boolean;
+
+  @ApiProperty({
+    description: 'Group attention factor in self-extend',
+    example: 1,
+  })
+  @IsOptional()
+  @IsNumber()
+  grp_attn_n?: number;
+
+  @ApiProperty({
+    description: 'Group attention width in self-extend',
+    example: 512,
+  })
+  @IsOptional()
+  @IsNumber()
+  grp_attn_w?: number;
+
+  @ApiProperty({
+    description: 'Prevent system swapping of the model to disk in macOS',
+    example: false,
+  })
+  @IsOptional()
+  @IsBoolean()
+  mlock?: boolean;
+
+  @ApiProperty({
+    description:
+      'You can constrain the sampling using GBNF grammars by providing path to a grammar file',
+  })
+  @IsOptional()
+  @IsString()
+  grammar_file?: string;
+
+  @ApiProperty({
+    description: 'To enable Flash Attention, default is true',
+    example: true,
+  })
+  @IsOptional()
+  @IsBoolean()
+  flash_attn?: boolean;
+
+  @ApiProperty({
+    description: 'KV cache type: f16, q8_0, q4_0, default is f16',
+    example: 'f16',
+  })
+  @IsOptional()
+  @IsString()
+  cache_type?: string;
+
+  @ApiProperty({
+    description: 'To enable mmap, default is true',
+    example: true,
+  })
+  @IsOptional()
+  @IsBoolean()
+  use_mmap?: boolean;
+
+  @ApiProperty({
     description: 'The engine to use.',
+    example: 'cortex.llamacpp',
   })
   @IsOptional()
   engine?: string;
diff --git a/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts b/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts
index 6d0b26e6b..9c10f75a9 100644
--- a/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts
+++ b/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts
@@ -4,7 +4,10 @@ import { PromptTemplate } from '@/domain/models/prompt-template.interface';
 import { join } from 'path';
 import { Model, ModelSettingParams } from '@/domain/models/model.interface';
 import { HttpService } from '@nestjs/axios';
-import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
+import {
+  defaultCortexCppHost,
+  defaultCortexCppPort,
+} from '@/infrastructure/constants/cortex';
 import { readdirSync } from 'node:fs';
 import { normalizeModelId } from '@/infrastructure/commanders/utils/normalize-model-id';
 import { firstValueFrom } from 'rxjs';
diff --git a/cortex-js/src/usecases/threads/threads.usecases.ts b/cortex-js/src/usecases/threads/threads.usecases.ts
index d40b9d9d6..6cf5e0c46 100644
--- a/cortex-js/src/usecases/threads/threads.usecases.ts
+++ b/cortex-js/src/usecases/threads/threads.usecases.ts
@@ -55,6 +55,7 @@ export class ThreadsUsecases {
     order: 'asc' | 'desc',
     after?: string,
     before?: string,
+    // eslint-disable-next-line @typescript-eslint/no-unused-vars
     runId?: string,
   ) {
     await this.getThreadOrThrow(threadId);