chore: update models settings (#673)

janhq · Jun 11, 2024 · 9b2c8bb · 9b2c8bb
1 parent f380642
commit 9b2c8bb
Show file tree

Hide file tree

Showing 10 changed files with 317 additions and 55 deletions.
diff --git a/cortex-js/src/domain/models/model.interface.ts b/cortex-js/src/domain/models/model.interface.ts
@@ -1,8 +1,3 @@
-export interface ModelArtifact {
-  mmproj?: string;
-  llama_model_path?: string;
-}
-
 /**
  * Model type defines the shape of a model object.
  * @stored
@@ -90,6 +85,56 @@ export interface Model {
    */
   cpu_threads?: number;
 
+  /**
+   * The prompt to use for internal configuration
+   */
+  pre_prompt?: string;
+
+  /**
+   * The batch size for prompt eval step
+   */
+  n_batch?: number;
+
+  /**
+   * Whether to enable prompt caching
+   */
+  caching_enabled?: boolean;
+
+  /**
+   * Group attention factor in self-extend
+   */
+  grp_attn_n?: number;
+
+  /**
+   * Group attention width in self-extend
+   */
+  grp_attn_w?: number;
+
+  /**
+   * Prevent system swapping of the model to disk in macOS
+   */
+  mlock?: boolean;
+
+  /**
+   * You can constrain the sampling using GBNF grammars by providing the path to a grammar file
+   */
+  grammar_file?: string;
+
+  /**
+   * Whether to enable Flash Attention; default is true
+   */
+  flash_attn?: boolean;
+
+  /**
+   * KV cache type: f16, q8_0, q4_0, default is f16
+   */
+  cache_type?: string;
+
+  /**
+   * Whether to enable mmap; default is true
+   */
+  use_mmap?: boolean;
+
   /**
    * The model engine.
    */
@@ -112,10 +157,20 @@ export interface ModelSettingParams {
   llama_model_path?: string;
   mmproj?: string;
   cont_batching?: boolean;
-  vision_model?: boolean;
-  text_model?: boolean;
   engine?: string;
   stop?: string[];
+  pre_prompt?: string;
+  n_batch?: number;
+  caching_enabled?: boolean;
+  grp_attn_n?: number;
+  grp_attn_w?: number;
+  mlock?: boolean;
+  grammar_file?: string;
+  model_type?: string;
+  model_alias?: string;
+  flash_attn?: boolean;
+  cache_type?: string;
+  use_mmap?: boolean;
 }
 
 /**
@@ -133,3 +188,12 @@ export interface ModelRuntimeParams {
   presence_penalty?: number;
   engine?: string;
 }
+
+/**
+ * The model artifact object.
+ * Used in case the model files are not a raw file list
+ */
+export interface ModelArtifact {
+  mmproj?: string;
+  llama_model_path?: string;
+}
diff --git a/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
@@ -18,7 +18,13 @@ import {
   OPEN_CHAT_3_5_JINJA,
   ZEPHYR,
   ZEPHYR_JINJA,
-} from '../../constants/prompt-constants';
+} from './../../constants/prompt-constants';
+import {
+  HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL,
+  HUGGING_FACE_REPO_MODEL_API_URL,
+  HUGGING_FACE_REPO_URL,
+  HUGGING_FACE_TREE_REF_URL,
+} from '../../constants/huggingface';
 import { ModelTokenizer } from '../types/model-tokenizer.interface';
 import { HttpService } from '@nestjs/axios';
 import { firstValueFrom } from 'rxjs';
@@ -29,12 +35,6 @@ import { join, basename } from 'path';
 import { load } from 'js-yaml';
 import { existsSync, readdirSync, readFileSync } from 'fs';
 import { isLocalModel, normalizeModelId } from '../utils/normalize-model-id';
-import {
-  HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL,
-  HUGGING_FACE_REPO_MODEL_API_URL,
-  HUGGING_FACE_REPO_URL,
-  HUGGING_FACE_TREE_REF_URL,
-} from '../../constants/huggingface';
 
 @Injectable()
 export class ModelsCliUsecases {

diff --git a/cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts
@@ -1,5 +1,11 @@
-import { Injectable } from '@nestjs/common';
-import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
+import { HttpStatus, Injectable } from '@nestjs/common';
+import {
+  CORTEX_CPP_MODELS_URL,
+  defaultCortexCppHost,
+  defaultCortexCppPort,
+} from '@/infrastructure/constants/cortex';
+import { HttpService } from '@nestjs/axios';
+import { firstValueFrom } from 'rxjs';
 
 export interface ModelStat {
   modelId: string;
@@ -15,6 +21,7 @@ interface ModelStatResponse {
 }
 @Injectable()
 export class PSCliUsecases {
+  constructor(private readonly httpService: HttpService) {}
   /**
    * Get models running in the Cortex C++ server
    * @param host Cortex host address
@@ -25,32 +32,31 @@ export class PSCliUsecases {
     port: number = defaultCortexCppPort,
   ): Promise<ModelStat[]> {
     return new Promise<ModelStat[]>((resolve, reject) =>
-      fetch(`http://${host}:${port}/inferences/server/models`)
+      firstValueFrom(this.httpService.get(CORTEX_CPP_MODELS_URL(host, port)))
         .then((res) => {
-          if (res.ok) {
-            res
-              .json()
-              .then(({ data }: ModelStatResponse) => {
-                if (data && Array.isArray(data) && data.length > 0) {
-                  resolve(
-                    data.map((e) => {
-                      const startTime = e.start_time ?? new Date();
-                      const currentTime = new Date();
-                      const duration =
-                        currentTime.getTime() - new Date(startTime).getTime();
-                      return {
-                        modelId: e.id,
-                        engine: e.engine ?? 'cortex.llamacpp',
-                        status: 'running',
-                        duration: this.formatDuration(duration),
-                        ram: e.ram ?? '-',
-                        vram: e.vram ?? '-',
-                      };
-                    }),
-                  );
-                } else reject();
-              })
-              .catch(reject);
+          const data = res.data as ModelStatResponse;
+          if (
+            res.status === HttpStatus.OK &&
+            data &&
+            Array.isArray(data.data) &&
+            data.data.length > 0
+          ) {
+            resolve(
+              data.data.map((e) => {
+                const startTime = e.start_time ?? new Date();
+                const currentTime = new Date();
+                const duration =
+                  currentTime.getTime() - new Date(startTime).getTime();
+                return {
+                  modelId: e.id,
+                  engine: e.engine ?? 'cortex.llamacpp',
+                  status: 'running',
+                  duration: this.formatDuration(duration),
+                  ram: e.ram ?? '-',
+                  vram: e.vram ?? '-',
+                };
+              }),
+            );
           } else reject();
         })
         .catch(reject),

diff --git a/cortex-js/src/infrastructure/constants/cortex.ts b/cortex-js/src/infrastructure/constants/cortex.ts
@@ -23,6 +23,11 @@ export const CORTEX_CPP_HEALTH_Z_URL = (
   port: number = defaultCortexCppPort,
 ) => `http://${host}:${port}/healthz`;
 
+export const CORTEX_CPP_MODELS_URL = (
+  host: string = defaultCortexCppHost,
+  port: number = defaultCortexCppPort,
+) => `http://${host}:${port}/inferences/server/models`;
+
 // INITIALIZATION
 export const CORTEX_RELEASES_URL =
   'https://api.github.com/repos/janhq/cortex/releases';

diff --git a/cortex-js/src/infrastructure/dtos/chat/create-chat-completion.dto.ts b/cortex-js/src/infrastructure/dtos/chat/create-chat-completion.dto.ts
@@ -2,6 +2,7 @@ import {
   IsArray,
   IsBoolean,
   IsNumber,
+  IsOptional,
   IsString,
   ValidateNested,
 } from 'class-validator';
@@ -29,46 +30,53 @@ export class CreateChatCompletionDto {
     description:
       'Determines the format for output generation. If set to `true`, the output is generated continuously, allowing for real-time streaming of responses. If set to `false`, the output is delivered in a single JSON file.',
   })
+  @IsOptional()
   @IsBoolean()
-  stream: boolean;
+  stream?: boolean;
 
   @ApiProperty({
     description:
       'Sets the upper limit on the number of tokens the model can generate in a single output.',
   })
+  @IsOptional()
   @IsNumber()
-  max_tokens: number;
+  max_tokens?: number;
 
   @ApiProperty({
     description:
       'Defines specific tokens or phrases that signal the model to stop producing further output.',
   })
+  @IsOptional()
   @IsArray()
-  stop: string[];
+  stop?: string[];
 
   @ApiProperty({
     description:
       'Modifies the likelihood of the model repeating the same words or phrases within a single output.',
   })
+  @IsOptional()
   @IsNumber()
-  frequency_penalty: number;
+  frequency_penalty?: number;
 
   @ApiProperty({
     description:
       'Reduces the likelihood of repeating tokens, promoting novelty in the output.',
   })
+  @IsOptional()
   @IsNumber()
-  presence_penalty: number;
+  presence_penalty?: number;
 
   @ApiProperty({
     description: "Influences the randomness of the model's output.",
   })
+  @IsOptional()
   @IsNumber()
-  temperature: number;
+  temperature?: number;
 
   @ApiProperty({
     description: 'Sets probability threshold for more relevant outputs.',
   })
+  @IsOptional()
   @IsNumber()
-  top_p: number;
+  top_p?: number;
 }
diff --git a/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts b/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts
@@ -1,6 +1,9 @@
 import { ApiProperty } from '@nestjs/swagger';
 import { IsIP, IsNumber, IsString, Max, Min } from 'class-validator';
-import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex';
+import {
+  defaultCortexCppHost,
+  defaultCortexCppPort,
+} from '@/infrastructure/constants/cortex';
 
 export class StartCortexDto {
   @ApiProperty({