diff --git a/cortex-js/src/infrastructure/commanders/models/model-start.command.ts b/cortex-js/src/infrastructure/commanders/models/model-start.command.ts
index 8b99c2cc6..fe50cfc1d 100644
--- a/cortex-js/src/infrastructure/commanders/models/model-start.command.ts
+++ b/cortex-js/src/infrastructure/commanders/models/model-start.command.ts
@@ -13,6 +13,7 @@ import { InitCliUsecases } from '../usecases/init.cli.usecases';
 import { existsSync } from 'node:fs';
 import { FileManagerService } from '@/infrastructure/services/file-manager/file-manager.service';
 import { join } from 'node:path';
+import { Engines } from '../types/engine.interface';
 
 type ModelStartOptions = {
   attach: boolean;
@@ -71,7 +72,7 @@ export class ModelStartCommand extends CommandRunner {
         engine,
       );
     }
-    if (engine === 'cortex.onnx' && process.platform !== 'win32') {
+    if (engine === Engines.onnx && process.platform !== 'win32') {
       console.error('The ONNX engine does not support this OS yet.');
       process.exit(1);
     }
diff --git a/cortex-js/src/infrastructure/commanders/types/engine.interface.ts b/cortex-js/src/infrastructure/commanders/types/engine.interface.ts
new file mode 100644
index 000000000..b033be369
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/types/engine.interface.ts
@@ -0,0 +1,5 @@
+export enum Engines {
+  llamaCPP = 'cortex.llamacpp',
+  onnx = 'cortex.onnx',
+  tensorrtLLM = 'cortex.tensorrt-llm',
+}
diff --git a/cortex-js/src/infrastructure/commanders/usecases/chat.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/chat.cli.usecases.ts
index 4990a4073..06254beb4 100644
--- a/cortex-js/src/infrastructure/commanders/usecases/chat.cli.usecases.ts
+++ b/cortex-js/src/infrastructure/commanders/usecases/chat.cli.usecases.ts
@@ -63,6 +63,8 @@ export class ChatCliUsecases {
     rl.on('line', sendCompletionMessage.bind(this));
 
     async function sendCompletionMessage(userInput: string) {
+      if (!userInput || userInput.trim() === '') return;
+
       if (userInput.trim() === this.exitClause) {
         rl.close();
         return;
@@ -98,12 +100,7 @@ export class ChatCliUsecases {
         model: modelId,
         stream: true,
         max_tokens: 4098,
-        stop: [],
-        frequency_penalty: 0.7,
-        presence_penalty: 0.7,
         temperature: 0.7,
-        top_p: 0.7,
-        // Override with model settings
         ...parser.parseModelInferenceParams(model),
       };
diff --git a/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts
index 6a5bb9f91..57e8dd54f 100644
--- a/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts
+++ b/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts
@@ -12,11 +12,12 @@ import { rm } from 'fs/promises';
 import { exec } from 'child_process';
 import { appPath } from '@/utils/app-path';
 import {
-  CORTEX_ONNX_ENGINE_RELEASES_URL,
+  CORTEX_ENGINE_RELEASES_URL,
   CORTEX_RELEASES_URL,
   CUDA_DOWNLOAD_URL,
 } from '@/infrastructure/constants/cortex';
 import { checkNvidiaGPUExist, cudaVersion } from '@/utils/cuda';
+import { Engines } from '../types/engine.interface';
 
 @Injectable()
 export class InitCliUsecases {
@@ -70,13 +71,14 @@
     )
       await this.installLlamaCppEngine(options, version);
 
-    if (engine === 'cortex.onnx' && process.platform === 'win32')
-      await this.installONNXEngine();
-    else if (engine === 'cortex.onnx' && process.platform !== 'win32') {
+    if (engine === Engines.onnx && process.platform !== 'win32') {
      console.error('The ONNX engine does not support this OS yet.');
      process.exit(1);
    }
+    if (engine !== 'cortex.llamacpp')
+      await this.installAcceleratedEngine('latest', engine);
+
     configs.initialized = true;
     await this.fileManagerService.writeConfigFile(configs);
   };
@@ -305,17 +307,17 @@
   };
 
   /**
-   * Download and install ONNX engine
+   * Download and install accelerated engine
    * @param version
    * @param engineFileName
    */
-  private async installONNXEngine(
+  private async installAcceleratedEngine(
     version: string = 'latest',
-    engineFileName: string = 'windows-amd64',
+    engine: string = Engines.onnx,
   ) {
     const res = await firstValueFrom(
       this.httpService.get(
-        CORTEX_ONNX_ENGINE_RELEASES_URL +
+        CORTEX_ENGINE_RELEASES_URL(engine) +
           `${version === 'latest' ? '/latest' : ''}`,
         {
           headers: {
@@ -338,15 +340,17 @@
       );
     }
     const toDownloadAsset = release.assets.find((s: any) =>
-      s.name.includes(engineFileName),
+      s.name.includes(process.platform === 'win32' ? 'windows' : 'linux'),
     );
 
     if (!toDownloadAsset) {
-      console.log(`Could not find engine file ${engineFileName}`);
+      console.log(
+        `Could not find engine file for platform ${process.platform}`,
+      );
       exit(1);
     }
 
-    console.log(`Downloading ONNX engine file ${engineFileName}`);
+    console.log(`Downloading engine file ${toDownloadAsset.name}`);
 
     const dataFolderPath = await this.fileManagerService.getDataFolderPath();
     const engineDir = join(dataFolderPath, 'cortex-cpp');
@@ -397,10 +401,10 @@
     await rm(destination, { force: true });
 
     // Copy the additional files to the cortex-cpp directory
-    for (const file of readdirSync(join(engineDir, 'engines', 'cortex.onnx'))) {
+    for (const file of readdirSync(join(engineDir, 'engines', engine))) {
       if (file !== 'engine.dll') {
         await cpSync(
-          join(engineDir, 'engines', 'cortex.onnx', file),
+          join(engineDir, 'engines', engine, file),
           join(engineDir, file),
         );
       }
diff --git a/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
index 3db95fe04..7d6d133cb 100644
--- a/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
+++ b/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
@@ -16,9 +16,10 @@ import { join, basename } from 'path';
 import { load } from 'js-yaml';
 import { existsSync, readdirSync, readFileSync } from 'fs';
 import { isLocalModel, normalizeModelId } from '@/utils/normalize-model-id';
-import { getHFModelMetadata } from '@/utils/huggingface';
+import { fetchJanRepoData, getHFModelMetadata } from '@/utils/huggingface';
 import { createWriteStream, mkdirSync, promises } from 'node:fs';
 import { firstValueFrom } from 'rxjs';
+import { Engines } from '../types/engine.interface';
 
 @Injectable()
 export class ModelsCliUsecases {
@@ -120,8 +121,8 @@
       process.exit(1);
     }
 
-    if (modelId.includes('onnx')) {
-      await this.pullOnnxModel(modelId);
+    if (modelId.includes('onnx') || modelId.includes('tensorrt')) {
+      await this.pullEngineModelFiles(modelId);
     } else {
       await this.pullGGUFModel(modelId);
       const bar = new SingleBar({}, Presets.shades_classic);
@@ -151,10 +152,10 @@
   }
 
   /**
-   * It's to pull ONNX model from HuggingFace repository
+   * It's to pull engine model files from HuggingFace repository
    * @param modelId
    */
-  private async pullOnnxModel(modelId: string) {
+  private async pullEngineModelFiles(modelId: string) {
     const modelsContainerDir = await this.fileService.getModelsPath();
 
     if (!existsSync(modelsContainerDir)) {
@@ -164,35 +165,22 @@ export class ModelsCliUsecases {
     const modelFolder = join(modelsContainerDir, normalizeModelId(modelId));
     await promises.mkdir(modelFolder, { recursive: true }).catch(() => {});
 
-    const files = [
-      'genai_config.json',
-      'model.onnx',
-      'model.onnx.data',
-      'model.yml',
-      'special_tokens_map.json',
-      'tokenizer.json',
-      'tokenizer_config.json',
-    ];
-    const repo = modelId.split(':')[0];
-    const branch = modelId.split(':')[1] || 'default';
+    const files = (await fetchJanRepoData(modelId)).siblings;
     for (const file of files) {
-      console.log(`Downloading ${file}`);
+      console.log(`Downloading ${file.rfilename}`);
       const bar = new SingleBar({}, Presets.shades_classic);
       bar.start(100, 0);
       const response = await firstValueFrom(
-        this.httpService.get(
-          `https://huggingface.co/cortexhub/${repo}/resolve/${branch}/${file}?download=true`,
-          {
-            responseType: 'stream',
-          },
-        ),
+        this.httpService.get(file.downloadUrl ?? '', {
+          responseType: 'stream',
+        }),
       );
 
       if (!response) {
        throw new Error('Failed to download model');
      }
 
       await new Promise((resolve, reject) => {
-        const writer = createWriteStream(join(modelFolder, file));
+        const writer = createWriteStream(join(modelFolder, file.rfilename));
         let receivedBytes = 0;
         const totalBytes = response.headers['content-length'];
@@ -281,7 +269,7 @@
       // Default Model Settings
       ctx_len: 4096,
       ngl: 100,
-      engine: modelId.includes('onnx') ? 'cortex.onnx' : 'cortex.llamacpp',
+      engine: Engines.llamaCPP,
     };
     if (!(await this.modelsUsecases.findOne(modelId)))
       await this.modelsUsecases.create(model);
diff --git a/cortex-js/src/infrastructure/constants/benchmark.ts b/cortex-js/src/infrastructure/constants/benchmark.ts
index a55574c73..083b46584 100644
--- a/cortex-js/src/infrastructure/constants/benchmark.ts
+++ b/cortex-js/src/infrastructure/constants/benchmark.ts
@@ -18,7 +18,6 @@ export const defaultBenchmarkConfiguration: BenchmarkConfig = {
     model: 'tinyllama',
     stream: true,
     max_tokens: 2048,
-    stop: [],
     frequency_penalty: 0,
     presence_penalty: 0,
     temperature: 0.7,
diff --git a/cortex-js/src/infrastructure/constants/cortex.ts b/cortex-js/src/infrastructure/constants/cortex.ts
index ad0690e05..45eeb96cb 100644
--- a/cortex-js/src/infrastructure/constants/cortex.ts
+++ b/cortex-js/src/infrastructure/constants/cortex.ts
@@ -42,8 +42,8 @@ export const CORTEX_JS_STOP_API_SERVER_URL = (
 export const CORTEX_RELEASES_URL =
   'https://api.github.com/repos/janhq/cortex/releases';
 
-export const CORTEX_ONNX_ENGINE_RELEASES_URL =
-  'https://api.github.com/repos/janhq/cortex.onnx/releases';
+export const CORTEX_ENGINE_RELEASES_URL = (engine: string) =>
+  `https://api.github.com/repos/janhq/${engine}/releases`;
 
 export const CUDA_DOWNLOAD_URL =
   'https://catalog.jan.ai/dist/cuda-dependencies///cuda.tar.gz';
diff --git a/cortex-js/src/infrastructure/repositories/extensions/extension.repository.ts b/cortex-js/src/infrastructure/repositories/extensions/extension.repository.ts
index 5664f80c2..f9bc3d809 100644
--- a/cortex-js/src/infrastructure/repositories/extensions/extension.repository.ts
+++ b/cortex-js/src/infrastructure/repositories/extensions/extension.repository.ts
@@ -7,6 +7,7 @@ import { EngineExtension } from '@/domain/abstracts/engine.abstract';
 import { appPath } from '@/utils/app-path';
 import { FileManagerService } from '@/infrastructure/services/file-manager/file-manager.service';
 import { existsSync } from 'fs';
+import { Engines } from '@/infrastructure/commanders/types/engine.interface';
 
 @Injectable()
 export class ExtensionRepositoryImpl implements ExtensionRepository {
@@ -18,9 +19,9 @@ export class ExtensionRepositoryImpl implements ExtensionRepository {
     private readonly cortexProvider: EngineExtension,
     private readonly fileService: FileManagerService,
   ) {
-    this.extensions.set('cortex.llamacpp', this.cortexProvider);
-    this.extensions.set('cortex.onnx', this.cortexProvider);
-    this.extensions.set('cortex.tensorrt-llm', this.cortexProvider);
+    this.extensions.set(Engines.llamaCPP, this.cortexProvider);
+    this.extensions.set(Engines.onnx, this.cortexProvider);
+    this.extensions.set(Engines.tensorrtLLM, this.cortexProvider);
     this.loadCoreExtensions();
     this.loadExternalExtensions();
   }
diff --git a/cortex-js/src/usecases/models/models.usecases.ts b/cortex-js/src/usecases/models/models.usecases.ts
index 1b7965d2b..a1615b970 100644
--- a/cortex-js/src/usecases/models/models.usecases.ts
+++ b/cortex-js/src/usecases/models/models.usecases.ts
@@ -40,6 +40,7 @@ import { EventEmitter2 } from '@nestjs/event-emitter';
 import { ModelEvent, ModelId, ModelStatus } from '@/domain/models/model.event';
 import { DownloadManagerService } from '@/infrastructure/services/download-manager/download-manager.service';
 import { ContextService } from '@/infrastructure/services/context/context.service';
+import { Engines } from '@/infrastructure/commanders/types/engine.interface';
 
 @Injectable()
 export class ModelsUsecases {
@@ -466,7 +467,7 @@
       // Default Model Settings
       ctx_len: 4096,
       ngl: 100,
-      engine: modelId.includes('onnx') ? 'cortex.onnx' : 'cortex.llamacpp',
+      engine: Engines.llamaCPP,
     };
     if (!(await this.findOne(modelId))) await this.create(model);
   }
diff --git a/cortex-js/src/utils/cuda.ts b/cortex-js/src/utils/cuda.ts
index a20fa17f3..fc252a20a 100644
--- a/cortex-js/src/utils/cuda.ts
+++ b/cortex-js/src/utils/cuda.ts
@@ -3,6 +3,13 @@ import { existsSync } from 'fs';
 import { delimiter } from 'path';
 import { checkFileExistenceInPaths } from './app-path';
 
+export type GpuSettingInfo = {
+  id: string;
+  vram: string;
+  name: string;
+  arch?: string;
+};
+
 /**
  * Return the CUDA version installed on the system
  * @returns CUDA Version 11 | 12
@@ -63,3 +70,46 @@ export const checkNvidiaGPUExist = (): Promise<boolean> => {
     });
   });
 };
+
+/**
+ * Get GPU information from the system
+ * @returns GPU information
+ */
+export const getGpuInfo = async (): Promise<GpuSettingInfo[]> =>
+  new Promise((resolve) => {
+    exec(
+      'nvidia-smi --query-gpu=index,memory.total,name --format=csv,noheader,nounits',
+      async (error, stdout) => {
+        if (!error) {
+          // Get GPU info and gpu has higher memory first
+          let highestVram = 0;
+          let highestVramId = '0';
+          const gpus: GpuSettingInfo[] = stdout
+            .trim()
+            .split('\n')
+            .map((line) => {
+              let [id, vram, name] = line.split(', ');
+              const arch = getGpuArch(name);
+              vram = vram.replace(/\r/g, '');
+              if (parseFloat(vram) > highestVram) {
+                highestVram = parseFloat(vram);
+                highestVramId = id;
+              }
+              return { id, vram, name, arch };
+            });
+
+          resolve(gpus);
+        } else {
+          resolve([]);
+        }
+      },
+    );
+  });
+
+const getGpuArch = (gpuName: string): string => {
+  if (!gpuName.toLowerCase().includes('nvidia')) return 'unknown';
+
+  if (gpuName.includes('30')) return 'ampere';
+  else if (gpuName.includes('40')) return 'ada';
+  else return 'unknown';
+};
diff --git a/cortex-js/src/utils/huggingface.ts b/cortex-js/src/utils/huggingface.ts
index 975b8fe89..de2a65dff 100644
--- a/cortex-js/src/utils/huggingface.ts
+++ b/cortex-js/src/utils/huggingface.ts
@@ -20,6 +20,7 @@ import {
 } from '@/infrastructure/constants/prompt-constants';
 import { gguf } from '@huggingface/gguf';
 import axios from 'axios';
+import { parseModelHubEngineBranch } from './normalize-model-id';
 
 // TODO: move this to somewhere else, should be reused by API as well. Maybe in a separate service / provider?
 export function guessPromptTemplateFromHuggingFace(jinjaCode?: string): string {
@@ -64,7 +65,6 @@ export function guessPromptTemplateFromHuggingFace(jinjaCode?: string): string {
 export async function fetchHuggingFaceRepoData(
   repoId: string,
 ): Promise<HuggingFaceRepoData> {
-
   const sanitizedUrl = getRepoModelsUrl(repoId);
   const { data: response } = await axios.get(sanitizedUrl);
 
@@ -113,7 +113,7 @@ export async function fetchJanRepoData(
   modelId: string,
 ): Promise<HuggingFaceRepoData> {
   const repo = modelId.split(':')[0];
-  const tree = modelId.split(':')[1] ?? 'default';
+  const tree = await parseModelHubEngineBranch(modelId.split(':')[1] ?? 'default');
 
   const url = getRepoModelsUrl(`cortexhub/${repo}`, tree);
   const res = await fetch(url);
@@ -164,8 +164,6 @@ export async function fetchJanRepoData(
 
   data.modelUrl = url;
 
-
-
   return data;
 }
diff --git a/cortex-js/src/utils/normalize-model-id.ts b/cortex-js/src/utils/normalize-model-id.ts
index 8c98e935e..f5d9e0b51 100644
--- a/cortex-js/src/utils/normalize-model-id.ts
+++ b/cortex-js/src/utils/normalize-model-id.ts
@@ -1,4 +1,5 @@
 import { ModelArtifact } from '@/domain/models/model.interface';
+import { getGpuInfo } from './cuda';
 
 export const normalizeModelId = (modelId: string): string => {
   return modelId.replace(':default', '').replace(/[:/]/g, '-');
@@ -13,3 +14,27 @@ export const isLocalModel = (
     !/^(http|https):\/\/[^/]+\/.*/.test(modelFiles[0])
   );
 };
+
+/**
+ * Parse the model hub engine branch
+ * @param branch
+ * @returns
+ */
+export const parseModelHubEngineBranch = async (
+  branch: string,
+): Promise<string> => {
+  if (branch.includes('tensorrt')) {
+    let engineBranch = branch;
+    const platform = process.platform == 'win32' ? 'windows' : 'linux';
+    if (!engineBranch.includes(platform)) {
+      engineBranch += `-${platform}`;
+    }
+
+    const gpus = await getGpuInfo();
+    if (gpus[0]?.arch && !engineBranch.includes(gpus[0].arch)) {
+      engineBranch += `-${gpus[0].arch}`;
+    }
+    return engineBranch;
+  }
+  return branch;
+};
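
Reviewer note: the sketch below is not part of the patch; it is a minimal, self-contained TypeScript illustration of how the pieces introduced here fit together. The Engines enum values double as janhq repository names, so CORTEX_ENGINE_RELEASES_URL(engine) yields the releases endpoint that installAcceleratedEngine queries, while parseModelHubEngineBranch appends platform and GPU-architecture suffixes to tensorrt branches. The resolveEngineBranch stand-in, the example model tag, and the 'ada' architecture used here are illustrative assumptions, not values taken from the diff.

// Standalone sketch (illustration only): composing the helpers introduced in this PR.
enum Engines {
  llamaCPP = 'cortex.llamacpp',
  onnx = 'cortex.onnx',
  tensorrtLLM = 'cortex.tensorrt-llm',
}

// Mirrors CORTEX_ENGINE_RELEASES_URL from constants/cortex.ts: the enum values
// double as the janhq repository names on GitHub.
const CORTEX_ENGINE_RELEASES_URL = (engine: string) =>
  `https://api.github.com/repos/janhq/${engine}/releases`;

// Simplified stand-in for parseModelHubEngineBranch: the real helper awaits
// getGpuInfo(); here the detected GPU arch is passed in so the sketch runs anywhere.
const resolveEngineBranch = (
  branch: string,
  platform: string,
  arch?: string,
): string => {
  if (!branch.includes('tensorrt')) return branch;
  let engineBranch = branch;
  const os = platform === 'win32' ? 'windows' : 'linux';
  if (!engineBranch.includes(os)) engineBranch += `-${os}`;
  if (arch && !engineBranch.includes(arch)) engineBranch += `-${arch}`;
  return engineBranch;
};

// Release endpoint queried by installAcceleratedEngine for a non-llamacpp engine:
console.log(CORTEX_ENGINE_RELEASES_URL(Engines.tensorrtLLM) + '/latest');
// -> https://api.github.com/repos/janhq/cortex.tensorrt-llm/releases/latest

// Branch resolution for a hypothetical `model:tensorrt-llm` tag on a Linux box with an Ada GPU:
console.log(resolveEngineBranch('tensorrt-llm', 'linux', 'ada'));
// -> tensorrt-llm-linux-ada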