From 9b2c8bbb8278d2bc69e1507dd79af1c0775f9144 Mon Sep 17 00:00:00 2001 From: Louis Date: Tue, 11 Jun 2024 14:52:06 +0700 Subject: [PATCH] chore: update models settings (#673) --- .../src/domain/models/model.interface.ts | 78 +++++++++++++-- .../usecases/models.cli.usecases.ts | 14 +-- .../commanders/usecases/ps.cli.usecases.ts | 60 ++++++------ .../src/infrastructure/constants/cortex.ts | 5 + .../dtos/chat/create-chat-completion.dto.ts | 22 +++-- .../dtos/cortex/start-cortex.dto.ts | 5 +- .../dtos/models/model-settings.dto.ts | 88 ++++++++++++++++- .../infrastructure/dtos/models/model.dto.ts | 94 ++++++++++++++++++- .../providers/cortex/cortex.provider.ts | 5 +- .../src/usecases/threads/threads.usecases.ts | 1 + 10 files changed, 317 insertions(+), 55 deletions(-) diff --git a/cortex-js/src/domain/models/model.interface.ts b/cortex-js/src/domain/models/model.interface.ts index e42ce2cee..9af4bd618 100644 --- a/cortex-js/src/domain/models/model.interface.ts +++ b/cortex-js/src/domain/models/model.interface.ts @@ -1,8 +1,3 @@ -export interface ModelArtifact { - mmproj?: string; - llama_model_path?: string; -} - /** * Model type defines the shape of a model object. * @stored @@ -90,6 +85,56 @@ export interface Model { */ cpu_threads?: number; + /** + * The prompt to use for internal configuration + */ + pre_prompt?: string; + + /** + * The batch size for prompt eval step + */ + n_batch?: number; + + /** + * To enable prompt caching or not + */ + caching_enabled?: boolean; + + /** + * Group attention factor in self-extend + */ + grp_attn_n?: number; + + /** + * Group attention width in self-extend + */ + grp_attn_w?: number; + + /** + * Prevent system swapping of the model to disk in macOS + */ + mlock?: boolean; + + /** + * You can constrain the sampling using GBNF grammars by providing path to a grammar file + */ + grammar_file?: string; + + /** + * To enable Flash Attention, default is true + */ + flash_attn?: boolean; + + /** + * KV cache type: f16, q8_0, q4_0, default is f16 + */ + cache_type?: string; + + /** + * To enable mmap, default is true + */ + use_mmap?: boolean; + /** * The model engine. */ @@ -112,10 +157,20 @@ export interface ModelSettingParams { llama_model_path?: string; mmproj?: string; cont_batching?: boolean; - vision_model?: boolean; - text_model?: boolean; engine?: string; stop?: string[]; + pre_prompt?: string; + n_batch?: number; + caching_enabled?: boolean; + grp_attn_n?: number; + grp_attn_w?: number; + mlock?: boolean; + grammar_file?: string; + model_type?: string; + model_alias?: string; + flash_attn?: boolean; + cache_type?: string; + use_mmap?: boolean; } /** @@ -133,3 +188,12 @@ export interface ModelRuntimeParams { presence_penalty?: number; engine?: string; } + +/** + * The model artifact object. + * In-case the model files is not a raw file list + */ +export interface ModelArtifact { + mmproj?: string; + llama_model_path?: string; +} diff --git a/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts index da07499a2..651c383f7 100644 --- a/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts +++ b/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts @@ -18,7 +18,13 @@ import { OPEN_CHAT_3_5_JINJA, ZEPHYR, ZEPHYR_JINJA, -} from '../../constants/prompt-constants'; +} from './../../constants/prompt-constants'; +import { + HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL, + HUGGING_FACE_REPO_MODEL_API_URL, + HUGGING_FACE_REPO_URL, + HUGGING_FACE_TREE_REF_URL, +} from '../../constants/huggingface'; import { ModelTokenizer } from '../types/model-tokenizer.interface'; import { HttpService } from '@nestjs/axios'; import { firstValueFrom } from 'rxjs'; @@ -29,12 +35,6 @@ import { join, basename } from 'path'; import { load } from 'js-yaml'; import { existsSync, readdirSync, readFileSync } from 'fs'; import { isLocalModel, normalizeModelId } from '../utils/normalize-model-id'; -import { - HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL, - HUGGING_FACE_REPO_MODEL_API_URL, - HUGGING_FACE_REPO_URL, - HUGGING_FACE_TREE_REF_URL, -} from '../../constants/huggingface'; @Injectable() export class ModelsCliUsecases { diff --git a/cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts index 70d20a16b..5e2e8db38 100644 --- a/cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts +++ b/cortex-js/src/infrastructure/commanders/usecases/ps.cli.usecases.ts @@ -1,5 +1,11 @@ -import { Injectable } from '@nestjs/common'; -import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex'; +import { HttpStatus, Injectable } from '@nestjs/common'; +import { + CORTEX_CPP_MODELS_URL, + defaultCortexCppHost, + defaultCortexCppPort, +} from '@/infrastructure/constants/cortex'; +import { HttpService } from '@nestjs/axios'; +import { firstValueFrom } from 'rxjs'; export interface ModelStat { modelId: string; @@ -15,6 +21,7 @@ interface ModelStatResponse { } @Injectable() export class PSCliUsecases { + constructor(private readonly httpService: HttpService) {} /** * Get models running in the Cortex C++ server * @param host Cortex host address @@ -25,32 +32,31 @@ export class PSCliUsecases { port: number = defaultCortexCppPort, ): Promise { return new Promise((resolve, reject) => - fetch(`http://${host}:${port}/inferences/server/models`) + firstValueFrom(this.httpService.get(CORTEX_CPP_MODELS_URL(host, port))) .then((res) => { - if (res.ok) { - res - .json() - .then(({ data }: ModelStatResponse) => { - if (data && Array.isArray(data) && data.length > 0) { - resolve( - data.map((e) => { - const startTime = e.start_time ?? new Date(); - const currentTime = new Date(); - const duration = - currentTime.getTime() - new Date(startTime).getTime(); - return { - modelId: e.id, - engine: e.engine ?? 'cortex.llamacpp', - status: 'running', - duration: this.formatDuration(duration), - ram: e.ram ?? '-', - vram: e.vram ?? '-', - }; - }), - ); - } else reject(); - }) - .catch(reject); + const data = res.data as ModelStatResponse; + if ( + res.status === HttpStatus.OK && + data && + Array.isArray(data.data) && + data.data.length > 0 + ) { + resolve( + data.data.map((e) => { + const startTime = e.start_time ?? new Date(); + const currentTime = new Date(); + const duration = + currentTime.getTime() - new Date(startTime).getTime(); + return { + modelId: e.id, + engine: e.engine ?? 'cortex.llamacpp', + status: 'running', + duration: this.formatDuration(duration), + ram: e.ram ?? '-', + vram: e.vram ?? '-', + }; + }), + ); } else reject(); }) .catch(reject), diff --git a/cortex-js/src/infrastructure/constants/cortex.ts b/cortex-js/src/infrastructure/constants/cortex.ts index 6e6b4c400..09707dd1a 100644 --- a/cortex-js/src/infrastructure/constants/cortex.ts +++ b/cortex-js/src/infrastructure/constants/cortex.ts @@ -23,6 +23,11 @@ export const CORTEX_CPP_HEALTH_Z_URL = ( port: number = defaultCortexCppPort, ) => `http://${host}:${port}/healthz`; +export const CORTEX_CPP_MODELS_URL = ( + host: string = defaultCortexCppHost, + port: number = defaultCortexCppPort, +) => `http://${host}:${port}/inferences/server/models`; + // INITIALIZATION export const CORTEX_RELEASES_URL = 'https://api.github.com/repos/janhq/cortex/releases'; diff --git a/cortex-js/src/infrastructure/dtos/chat/create-chat-completion.dto.ts b/cortex-js/src/infrastructure/dtos/chat/create-chat-completion.dto.ts index 5e48edc92..e2ccbc542 100644 --- a/cortex-js/src/infrastructure/dtos/chat/create-chat-completion.dto.ts +++ b/cortex-js/src/infrastructure/dtos/chat/create-chat-completion.dto.ts @@ -2,6 +2,7 @@ import { IsArray, IsBoolean, IsNumber, + IsOptional, IsString, ValidateNested, } from 'class-validator'; @@ -29,46 +30,53 @@ export class CreateChatCompletionDto { description: 'Determines the format for output generation. If set to `true`, the output is generated continuously, allowing for real-time streaming of responses. If set to `false`, the output is delivered in a single JSON file.', }) + @IsOptional() @IsBoolean() - stream: boolean; + stream?: boolean; @ApiProperty({ description: 'Sets the upper limit on the number of tokens the model can generate in a single output.', }) + @IsOptional() @IsNumber() - max_tokens: number; + max_tokens?: number; @ApiProperty({ description: 'Defines specific tokens or phrases that signal the model to stop producing further output.', }) + @IsOptional() @IsArray() - stop: string[]; + stop?: string[]; @ApiProperty({ description: 'Modifies the likelihood of the model repeating the same words or phrases within a single output.', }) + @IsOptional() @IsNumber() - frequency_penalty: number; + frequency_penalty?: number; @ApiProperty({ description: 'Reduces the likelihood of repeating tokens, promoting novelty in the output.', }) + @IsOptional() @IsNumber() - presence_penalty: number; + presence_penalty?: number; @ApiProperty({ description: "Influences the randomness of the model's output.", }) + @IsOptional() @IsNumber() - temperature: number; + temperature?: number; @ApiProperty({ description: 'Sets probability threshold for more relevant outputs.', }) + @IsOptional() @IsNumber() - top_p: number; + top_p?: number; } diff --git a/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts b/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts index 6a8536bfc..49e143521 100644 --- a/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts +++ b/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts @@ -1,6 +1,9 @@ import { ApiProperty } from '@nestjs/swagger'; import { IsIP, IsNumber, IsString, Max, Min } from 'class-validator'; -import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex'; +import { + defaultCortexCppHost, + defaultCortexCppPort, +} from '@/infrastructure/constants/cortex'; export class StartCortexDto { @ApiProperty({ diff --git a/cortex-js/src/infrastructure/dtos/models/model-settings.dto.ts b/cortex-js/src/infrastructure/dtos/models/model-settings.dto.ts index 32dffe469..90431b03b 100644 --- a/cortex-js/src/infrastructure/dtos/models/model-settings.dto.ts +++ b/cortex-js/src/infrastructure/dtos/models/model-settings.dto.ts @@ -1,6 +1,13 @@ import { ModelSettingParams } from '@/domain/models/model.interface'; import { ApiProperty } from '@nestjs/swagger'; -import { IsArray, IsNumber, IsOptional, Min } from 'class-validator'; +import { + IsArray, + IsBoolean, + IsNumber, + IsOptional, + IsString, + Min, +} from 'class-validator'; export class ModelSettingsDto implements ModelSettingParams { // Prompt Settings @@ -47,6 +54,85 @@ export class ModelSettingsDto implements ModelSettingParams { @Min(1) cpu_threads?: number; + @ApiProperty({ + description: 'The prompt to use for internal configuration', + }) + @IsOptional() + @IsString() + pre_prompt?: string; + + @ApiProperty({ + description: 'The batch size for prompt eval step', + example: 2048, + }) + @IsOptional() + @IsNumber() + n_batch?: number; + + @ApiProperty({ + description: 'To enable prompt caching or not', + example: true, + }) + @IsOptional() + @IsBoolean() + caching_enabled?: boolean; + + @ApiProperty({ + description: 'Group attention factor in self-extend', + example: 1, + }) + @IsOptional() + @IsNumber() + grp_attn_n?: number; + + @ApiProperty({ + description: 'Group attention width in self-extend', + example: 512, + }) + @IsOptional() + @IsNumber() + grp_attn_w?: number; + + @ApiProperty({ + description: 'Prevent system swapping of the model to disk in macOS', + example: false, + }) + @IsOptional() + @IsBoolean() + mlock?: boolean; + + @ApiProperty({ + description: + 'You can constrain the sampling using GBNF grammars by providing path to a grammar file', + }) + @IsOptional() + @IsString() + grammar_file?: string; + + @ApiProperty({ + description: 'To enable Flash Attention, default is true', + example: true, + }) + @IsOptional() + @IsBoolean() + flash_attn?: boolean; + + @ApiProperty({ + description: 'KV cache type: f16, q8_0, q4_0, default is f16', + example: 'f16', + }) + @IsOptional() + @IsString() + cache_type?: string; + + @ApiProperty({ + description: 'To enable mmap, default is true', + example: true, + }) + @IsOptional() + @IsBoolean() + use_mmap?: boolean; + @ApiProperty({ example: 'cortex.llamacpp', description: 'The engine to use.', diff --git a/cortex-js/src/infrastructure/dtos/models/model.dto.ts b/cortex-js/src/infrastructure/dtos/models/model.dto.ts index 0edecfb47..0c0d3dc96 100644 --- a/cortex-js/src/infrastructure/dtos/models/model.dto.ts +++ b/cortex-js/src/infrastructure/dtos/models/model.dto.ts @@ -1,6 +1,12 @@ import { Model } from '@/domain/models/model.interface'; import { ApiProperty } from '@nestjs/swagger'; -import { IsArray, IsBoolean, IsNumber, IsOptional } from 'class-validator'; +import { + IsArray, + IsBoolean, + IsNumber, + IsOptional, + IsString, +} from 'class-validator'; export class ModelDto implements Partial { @ApiProperty({ @@ -85,7 +91,7 @@ export class ModelDto implements Partial { presence_penalty?: number; // Engine Settings - @ApiProperty({ description: 'Determines GPU layer usage.', example: 4096 }) + @ApiProperty({ description: 'Determines GPU layer usage.', example: 32 }) @IsOptional() @IsNumber() ngl?: number; @@ -101,15 +107,95 @@ export class ModelDto implements Partial { @ApiProperty({ description: - 'Determines CPU inference threads, limited by hardware and OS. ', + 'Determines CPU inference threads, limited by hardware and OS.', + example: 10, }) @IsOptional() @IsNumber() cpu_threads?: number; @ApiProperty({ - example: 'cortex.llamacpp', + description: 'The prompt to use for internal configuration', + }) + @IsOptional() + @IsString() + pre_prompt?: string; + + @ApiProperty({ + description: 'The batch size for prompt eval step', + example: 512, + }) + @IsOptional() + @IsNumber() + n_batch?: number; + + @ApiProperty({ + description: 'To enable prompt caching or not', + example: true, + }) + @IsOptional() + @IsBoolean() + caching_enabled?: boolean; + + @ApiProperty({ + description: 'Group attention factor in self-extend', + example: 1, + }) + @IsOptional() + @IsNumber() + grp_attn_n?: number; + + @ApiProperty({ + description: 'Group attention width in self-extend', + example: 512, + }) + @IsOptional() + @IsNumber() + grp_attn_w?: number; + + @ApiProperty({ + description: 'Prevent system swapping of the model to disk in macOS', + example: false, + }) + @IsOptional() + @IsBoolean() + mlock?: boolean; + + @ApiProperty({ + description: + 'You can constrain the sampling using GBNF grammars by providing path to a grammar file', + }) + @IsOptional() + @IsString() + grammar_file?: string; + + @ApiProperty({ + description: 'To enable Flash Attention, default is true', + example: true, + }) + @IsOptional() + @IsBoolean() + flash_attn?: boolean; + + @ApiProperty({ + description: 'KV cache type: f16, q8_0, q4_0, default is f16', + example: 'f16', + }) + @IsOptional() + @IsString() + cache_type?: string; + + @ApiProperty({ + description: 'To enable mmap, default is true', + example: true, + }) + @IsOptional() + @IsBoolean() + use_mmap?: boolean; + + @ApiProperty({ description: 'The engine to use.', + example: 'cortex.llamacpp', }) @IsOptional() engine?: string; diff --git a/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts b/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts index 6d0b26e6b..9c10f75a9 100644 --- a/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts +++ b/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts @@ -4,7 +4,10 @@ import { PromptTemplate } from '@/domain/models/prompt-template.interface'; import { join } from 'path'; import { Model, ModelSettingParams } from '@/domain/models/model.interface'; import { HttpService } from '@nestjs/axios'; -import { defaultCortexCppHost, defaultCortexCppPort } from '@/infrastructure/constants/cortex'; +import { + defaultCortexCppHost, + defaultCortexCppPort, +} from '@/infrastructure/constants/cortex'; import { readdirSync } from 'node:fs'; import { normalizeModelId } from '@/infrastructure/commanders/utils/normalize-model-id'; import { firstValueFrom } from 'rxjs'; diff --git a/cortex-js/src/usecases/threads/threads.usecases.ts b/cortex-js/src/usecases/threads/threads.usecases.ts index d40b9d9d6..6cf5e0c46 100644 --- a/cortex-js/src/usecases/threads/threads.usecases.ts +++ b/cortex-js/src/usecases/threads/threads.usecases.ts @@ -55,6 +55,7 @@ export class ThreadsUsecases { order: 'asc' | 'desc', after?: string, before?: string, + // eslint-disable-next-line @typescript-eslint/no-unused-vars runId?: string, ) { await this.getThreadOrThrow(threadId);