From f19a5f65b2861fb79e4f23443ed3ce5297573db7 Mon Sep 17 00:00:00 2001
From: Louis Le
Date: Mon, 17 Jun 2024 23:54:17 +0700
Subject: [PATCH 1/3] feat: ship ONNX runtime on Windows

---
 .../commanders/usecases/init.cli.usecases.ts  | 119 +++++++++++++++-
 .../usecases/models.cli.usecases.ts           | 128 +++++++++++++++---
 .../src/infrastructure/constants/cortex.ts    |   3 +
 .../infrastructure/constants/huggingface.ts   |   2 +-
 .../providers/cortex/cortex.provider.ts       |   2 +-
 .../extensions/extension.repository.ts        |   5 +-
 cortex-js/src/usecases/chat/chat.usecases.ts  |   7 +-
 .../src/usecases/models/models.usecases.ts    |  21 +--
 cortex-js/src/utils/huggingface.ts            |  11 +-
 9 files changed, 254 insertions(+), 44 deletions(-)

diff --git a/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts
index 9229db882..cc053df6e 100644
--- a/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts
+++ b/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts
@@ -1,4 +1,11 @@
-import { createWriteStream, existsSync, rmSync } from 'fs';
+import {
+  cpSync,
+  createWriteStream,
+  existsSync,
+  readdir,
+  readdirSync,
+  rmSync,
+} from 'fs';
 import { delimiter, join } from 'path';
 import { HttpService } from '@nestjs/axios';
 import { Presets, SingleBar } from 'cli-progress';
@@ -12,6 +19,7 @@ import { rm } from 'fs/promises';
 import { exec } from 'child_process';
 import { appPath } from '@/utils/app-path';
 import {
+  CORTEX_ONNX_ENGINE_RELEASES_URL,
   CORTEX_RELEASES_URL,
   CUDA_DOWNLOAD_URL,
 } from '@/infrastructure/constants/cortex';
@@ -59,7 +67,7 @@ export class InitCliUsecases {
       exit(1);
     }
 
-    console.log(`Downloading engine file ${engineFileName}`);
+    console.log(`Downloading Llama.cpp engine file ${engineFileName}`);
     const dataFolderPath = await this.fileManagerService.getDataFolderPath();
     const engineDir = join(dataFolderPath, 'cortex-cpp');
     if (existsSync(engineDir)) rmSync(engineDir, { recursive: true });
@@ -109,6 +117,9 @@
       exit(1);
     }
     await rm(destination, { force: true });
+
+    // Ship ONNX Runtime on Windows by default
+    if (process.platform === 'win32') await this.installONNXEngine();
   };
 
   parseEngineFileName = (options?: InitOptions) => {
@@ -187,6 +198,7 @@
     ).replace('<platform>', platform);
 
     const destination = join(dataFolderPath, 'cuda-toolkit.tar.gz');
+    console.log('Downloading CUDA Toolkit dependency...');
     const download = await firstValueFrom(
       this.httpService.get(url, {
         responseType: 'stream',
@@ -283,6 +295,109 @@
     });
   };
 
+  /**
+   * Download and install the ONNX engine
+   * @param version Engine version to install, e.g. 'latest'
+   * @param engineFileName Name of the release asset to download
+   */
+  async installONNXEngine(
+    version: string = 'v0.1.1',
+    engineFileName: string = 'windows-amd64',
+  ) {
+    const res = await firstValueFrom(
+      this.httpService.get(
+        CORTEX_ONNX_ENGINE_RELEASES_URL +
+          `${version === 'latest' ? '/latest' : ''}`,
+        {
+          headers: {
+            'X-GitHub-Api-Version': '2022-11-28',
+            Accept: 'application/vnd.github+json',
+          },
+        },
+      ),
+    );
+
+    if (!res?.data) {
+      console.log('Failed to fetch releases');
+      exit(1);
+    }
+
+    let release = res?.data;
+    if (Array.isArray(res?.data)) {
+      release = res.data.find(
+        (e: any) => e.name === version.replace('v', ''),
+      );
+    }
+    const toDownloadAsset = release.assets.find((s: any) =>
+      s.name.includes(engineFileName),
+    );
+
+    if (!toDownloadAsset) {
+      console.log(`Could not find engine file ${engineFileName}`);
+      exit(1);
+    }
+
+    console.log(`Downloading ONNX engine file ${engineFileName}`);
+    const dataFolderPath = await this.fileManagerService.getDataFolderPath();
+    const engineDir = join(dataFolderPath, 'cortex-cpp');
+
+    const download = await firstValueFrom(
+      this.httpService.get(toDownloadAsset.browser_download_url, {
+        responseType: 'stream',
+      }),
+    );
+    if (!download) {
+      console.log('Failed to download engine');
+      process.exit(1);
+    }
+
+    const destination = join(dataFolderPath, toDownloadAsset.name);
+
+    await new Promise((resolve, reject) => {
+      const writer = createWriteStream(destination);
+      let receivedBytes = 0;
+      const totalBytes = download.headers['content-length'];
+
+      writer.on('finish', () => {
+        bar.stop();
+        resolve(true);
+      });
+
+      writer.on('error', (error) => {
+        bar.stop();
+        reject(error);
+      });
+
+      const bar = new SingleBar({}, Presets.shades_classic);
+      bar.start(100, 0);
+
+      download.data.on('data', (chunk: any) => {
+        receivedBytes += chunk.length;
+        bar.update(Math.floor((receivedBytes / totalBytes) * 100));
+      });
+
+      download.data.pipe(writer);
+    });
+
+    try {
+      await decompress(destination, join(engineDir, 'engines'));
+    } catch (e) {
+      console.error('Error decompressing file', e);
+      exit(1);
+    }
+    await rm(destination, { force: true });
+
+    // Copy the additional files to the cortex-cpp directory
+    for (const file of readdirSync(join(engineDir, 'engines', 'cortex.onnx'))) {
+      if (file !== 'engine.dll') {
+        cpSync(
+          join(engineDir, 'engines', 'cortex.onnx', file),
+          join(engineDir, file),
+        );
+      }
+    }
+  }
+
   private checkFileExistenceInPaths = (
     file: string,
     paths: string[],
diff --git a/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
index b3ddb5470..55ce533bd 100644
--- a/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
+++ b/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
@@ -17,6 +17,8 @@ import { load } from 'js-yaml';
 import { existsSync, readdirSync, readFileSync } from 'fs';
 import { isLocalModel, normalizeModelId } from '@/utils/normalize-model-id';
 import { getHFModelMetadata } from '@/utils/huggingface';
+import { createWriteStream, mkdirSync, promises } from 'node:fs';
+import { firstValueFrom } from 'rxjs';
 
 @Injectable()
 export class ModelsCliUsecases {
@@ -118,40 +120,116 @@ export class ModelsCliUsecases {
       process.exit(1);
     }
 
-    await this.pullHuggingFaceModel(modelId);
-    const bar = new SingleBar({}, Presets.shades_classic);
-    bar.start(100, 0);
-    const callback = (progress: number) => {
-      bar.update(progress);
-    };
+    if (modelId.includes('onnx')) {
+      await this.pullOnnxModel(modelId);
+    } else {
+      await this.pullGGUFModel(modelId);
+      const bar = new SingleBar({}, Presets.shades_classic);
+      bar.start(100, 0);
+      const callback = (progress: number) => {
+        bar.update(progress);
+      };
+
+      try {
+        await this.modelsUsecases.downloadModel(modelId, callback);
+
+        const model = await this.modelsUsecases.findOne(modelId);
+        const fileUrl = join(
+          await this.fileService.getModelsPath(),
+          normalizeModelId(modelId),
+          basename((model?.files as string[])[0]),
+        );
+        await this.modelsUsecases.update(modelId, {
+          files: [fileUrl],
+          name: modelId.replace(':default', ''),
+        });
+      } catch (err) {
+        bar.stop();
+        throw err;
+      }
+    }
+  }
+
+  /**
+   * Pulls an ONNX model from the HuggingFace repository
+   * @param modelId The model id to pull
+   */
+  private async pullOnnxModel(modelId: string) {
+    const modelsContainerDir = await this.fileService.getModelsPath();
+
+    if (!existsSync(modelsContainerDir)) {
+      mkdirSync(modelsContainerDir, { recursive: true });
+    }
+
+    const modelFolder = join(modelsContainerDir, normalizeModelId(modelId));
+    await promises.mkdir(modelFolder, { recursive: true }).catch(() => {});
 
-    try {
-      await this.modelsUsecases.downloadModel(modelId, callback);
+    const files = [
+      'genai_config.json',
+      'model.onnx',
+      'model.onnx.data',
+      'model.yml',
+      'special_tokens_map.json',
+      'tokenizer.json',
+      'tokenizer_config.json',
+    ];
+    const repo = modelId.split(':')[0];
+    const branch = modelId.split(':')[1] || 'default';
+    for (const file of files) {
+      console.log(`Downloading ${file}`);
+      const bar = new SingleBar({}, Presets.shades_classic);
+      bar.start(100, 0);
 
-      const model = await this.modelsUsecases.findOne(modelId);
-      const fileUrl = join(
-        await this.fileService.getModelsPath(),
-        normalizeModelId(modelId),
-        basename((model?.files as string[])[0]),
+      const response = await firstValueFrom(
+        this.httpService.get(
+          `https://huggingface.co/cortexhub/${repo}/resolve/${branch}/${file}?download=true`,
+          {
+            responseType: 'stream',
+          },
+        ),
       );
-      await this.modelsUsecases.update(modelId, {
-        files: [fileUrl],
-        name: modelId.replace(':default', ''),
+      if (!response) {
+        throw new Error('Failed to download model');
+      }
+
+      await new Promise((resolve, reject) => {
+        const writer = createWriteStream(join(modelFolder, file));
+        let receivedBytes = 0;
+        const totalBytes = response.headers['content-length'];
+
+        writer.on('finish', () => {
+          resolve(true);
+        });
+
+        writer.on('error', (error) => {
+          reject(error);
+        });
+
+        response.data.on('data', (chunk: any) => {
+          receivedBytes += chunk.length;
+          bar.update(Math.floor((receivedBytes / totalBytes) * 100));
+        });
+
+        response.data.pipe(writer);
       });
-    } catch (err) {
       bar.stop();
-      throw err;
     }
-  }
 
-  //// PRIVATE METHODS ////
+    const model: CreateModelDto = load(
+      readFileSync(join(modelFolder, 'model.yml'), 'utf-8'),
+    ) as CreateModelDto;
+    model.files = [join(modelFolder)];
+    model.model = modelId;
 
+    if (!(await this.modelsUsecases.findOne(modelId)))
+      await this.modelsUsecases.create(model);
+  }
 
   /**
    * Pulls a model from the HuggingFace repository.
    * It could be a model from Jan's repo or another author's.
    * @param modelId HuggingFace model id, e.g. "janhq/llama-3" or "llama3:7b"
    */
-  private async pullHuggingFaceModel(modelId: string) {
+  private async pullGGUFModel(modelId: string) {
     const data: HuggingFaceRepoData =
       await this.modelsUsecases.fetchModelMetadata(modelId);
@@ -179,6 +257,7 @@
     } else {
       modelVersion = data.siblings.find((e) => e.rfilename.includes('.gguf'));
     }
+
     if (!modelVersion) throw 'No expected quantization found';
 
     const metadata = await getHFModelMetadata(modelVersion.downloadUrl!);
@@ -203,12 +282,17 @@
       // Default Model Settings
       ctx_len: 4096,
       ngl: 100,
-      engine: 'cortex.llamacpp',
+      engine: modelId.includes('onnx') ? 'cortex.onnx' : 'cortex.llamacpp',
     };
 
     if (!(await this.modelsUsecases.findOne(modelId)))
       await this.modelsUsecases.create(model);
   }
 
+  /**
+   * Parse a preset file into model settings
+   * @param preset Name of the preset file to parse
+   * @returns The parsed preset settings
+   */
   private async parsePreset(preset?: string): Promise<object> {
     const presetsFolder = await this.fileService.getPresetsPath();
diff --git a/cortex-js/src/infrastructure/constants/cortex.ts b/cortex-js/src/infrastructure/constants/cortex.ts
index dc21b027e..7d458da59 100644
--- a/cortex-js/src/infrastructure/constants/cortex.ts
+++ b/cortex-js/src/infrastructure/constants/cortex.ts
@@ -42,6 +42,9 @@ export const CORTEX_JS_STOP_API_SERVER_URL = (
 export const CORTEX_RELEASES_URL =
   'https://api.github.com/repos/janhq/cortex/releases';
 
+export const CORTEX_ONNX_ENGINE_RELEASES_URL =
+  'https://api.github.com/repos/janhq/cortex.onnx/releases';
+
 export const CUDA_DOWNLOAD_URL =
   'https://catalog.jan.ai/dist/cuda-dependencies/<version>/<platform>/cuda.tar.gz';
 
diff --git a/cortex-js/src/infrastructure/constants/huggingface.ts b/cortex-js/src/infrastructure/constants/huggingface.ts
index b2282e1f1..1e1c89a78 100644
--- a/cortex-js/src/infrastructure/constants/huggingface.ts
+++ b/cortex-js/src/infrastructure/constants/huggingface.ts
@@ -2,7 +2,7 @@ export const HUGGING_FACE_TREE_REF_URL = (
   repo: string,
   tree: string,
   path: string,
-) => `https://huggingface.co/janhq/${repo}/resolve/${tree}/${path}`;
+) => `https://huggingface.co/cortexhub/${repo}/resolve/${tree}/${path}`;
 
 export const HUGGING_FACE_DOWNLOAD_FILE_MAIN_URL = (
   author: string,
diff --git a/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts b/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts
index d186a6278..7e2dc8618 100644
--- a/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts
+++ b/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts
@@ -15,7 +15,7 @@ import { FileManagerService } from '@/infrastructure/services/file-manager/file-
 
 @Injectable()
 export default class CortexProvider extends OAIEngineExtension {
-  provider: string = 'cortex.llamacpp';
+  provider: string = 'cortex';
   apiUrl = `http://${defaultCortexCppHost}:${defaultCortexCppPort}/inferences/server/chat_completion`;
 
   private loadModelUrl = `http://${defaultCortexCppHost}:${defaultCortexCppPort}/inferences/server/loadmodel`;
diff --git a/cortex-js/src/infrastructure/repositories/extensions/extension.repository.ts b/cortex-js/src/infrastructure/repositories/extensions/extension.repository.ts
index 484cea812..389e5f371 100644
--- a/cortex-js/src/infrastructure/repositories/extensions/extension.repository.ts
+++ b/cortex-js/src/infrastructure/repositories/extensions/extension.repository.ts
@@ -11,7 +11,10 @@ import { existsSync } from 'fs';
 @Injectable()
 export class ExtensionRepositoryImpl implements ExtensionRepository {
   // Initialize the Extensions Map with the key-value pairs of the core providers.
-  extensions = new Map([['cortex', this.cortexProvider]]);
+  extensions = new Map([
+    ['cortex.llamacpp', this.cortexProvider],
+    ['cortex.onnx', this.cortexProvider],
+  ]);
 
   constructor(
     @Inject('CORTEX_PROVIDER')
diff --git a/cortex-js/src/usecases/chat/chat.usecases.ts b/cortex-js/src/usecases/chat/chat.usecases.ts
index dd3d41a0e..806ebb06a 100644
--- a/cortex-js/src/usecases/chat/chat.usecases.ts
+++ b/cortex-js/src/usecases/chat/chat.usecases.ts
@@ -25,15 +25,14 @@ export class ChatUsecases {
     headers: Record<string, string>,
   ): Promise<any> {
     const { model: modelId } = createChatDto;
-    const extensions = (await this.extensionRepository.findAll()) ?? [];
     const model = await this.modelRepository.findOne(modelId);
     if (!model) {
       throw new ModelNotFoundException(modelId);
     }
-    const engine = extensions.find((e: any) => e.provider === model?.engine) as
-      | EngineExtension
-      | undefined;
+    const engine = (await this.extensionRepository.findOne(
+      model!.engine ?? 'cortex.llamacpp',
+    )) as EngineExtension | undefined;
 
     if (engine == null) {
       throw new Error(`No engine found with name: ${model.engine}`);
diff --git a/cortex-js/src/usecases/models/models.usecases.ts b/cortex-js/src/usecases/models/models.usecases.ts
index a14c3230a..c759cea6a 100644
--- a/cortex-js/src/usecases/models/models.usecases.ts
+++ b/cortex-js/src/usecases/models/models.usecases.ts
@@ -28,6 +28,7 @@ import {
   HuggingFaceModelVersion,
   HuggingFaceRepoData,
 } from '@/domain/models/huggingface.interface';
+
 import { LLAMA_2 } from '@/infrastructure/constants/prompt-constants';
 import { isValidUrl } from '@/utils/urls';
 import {
@@ -147,10 +148,9 @@
     settings?: ModelSettingParams,
   ): Promise<StartModelSuccessDto> {
     const model = await this.getModelOrThrow(modelId);
-    const extensions = (await this.extensionRepository.findAll()) ?? [];
-    const engine = extensions.find((e: any) => e.provider === model?.engine) as
-      | EngineExtension
-      | undefined;
+    const engine = (await this.extensionRepository.findOne(
+      model!.engine ?? 'cortex.llamacpp',
+    )) as EngineExtension | undefined;
 
     if (!engine) {
       return {
@@ -180,8 +180,9 @@
       Array.isArray(model.files) && !('llama_model_path' in model) && {
         llama_model_path: (model.files as string[])[0],
+        model_path: (model.files as string[])[0],
       }),
-      engine: 'cortex.llamacpp',
+      engine: model.engine ?? 'cortex.llamacpp',
       // User / Model settings
       ...parser.parseModelEngineSettings(model),
       ...parser.parseModelEngineSettings(settings ?? {}),
@@ -233,10 +234,9 @@
 
   async stopModel(modelId: string): Promise<StartModelSuccessDto> {
     const model = await this.getModelOrThrow(modelId);
-    const extensions = (await this.extensionRepository.findAll()) ?? [];
-    const engine = extensions.find((e: any) => e.provider === model?.engine) as
-      | EngineExtension
-      | undefined;
+    const engine = (await this.extensionRepository.findOne(
+      model!.engine ?? 'cortex.llamacpp',
+    )) as EngineExtension | undefined;
 
     if (!engine) {
       return {
@@ -419,6 +419,7 @@
     modelVersion: HuggingFaceModelVersion,
   ) {
     if (!modelVersion) throw 'No expected quantization found';
+
     const tokenizer = await getHFModelMetadata(modelVersion.downloadUrl!);
 
     const promptTemplate = tokenizer?.promptTemplate ?? LLAMA_2;
@@ -442,7 +443,7 @@
       // Default Model Settings
       ctx_len: 4096,
       ngl: 100,
-      engine: 'cortex.llamacpp',
+      engine: modelId.includes('onnx') ? 'cortex.onnx' : 'cortex.llamacpp',
     };
     if (!(await this.findOne(modelId))) await this.create(model);
   }
diff --git a/cortex-js/src/utils/huggingface.ts b/cortex-js/src/utils/huggingface.ts
index 68a6c38f9..975b8fe89 100644
--- a/cortex-js/src/utils/huggingface.ts
+++ b/cortex-js/src/utils/huggingface.ts
@@ -64,6 +64,7 @@ export function guessPromptTemplateFromHuggingFace(jinjaCode?: string): string {
 export async function fetchHuggingFaceRepoData(
   repoId: string,
 ): Promise<HuggingFaceRepoData> {
+
   const sanitizedUrl = getRepoModelsUrl(repoId);
 
   const { data: response } = await axios.get(sanitizedUrl);
@@ -113,7 +114,8 @@ export async function fetchJanRepoData(
 ): Promise<HuggingFaceRepoData> {
   const repo = modelId.split(':')[0];
   const tree = modelId.split(':')[1] ?? 'default';
-  const url = getRepoModelsUrl(`janhq/${repo}`, tree);
+  const url = getRepoModelsUrl(`cortexhub/${repo}`, tree);
+
   const res = await fetch(url);
   const response:
     | {
@@ -140,7 +142,7 @@
       tags: ['gguf'],
       id: modelId,
       modelId: modelId,
-      author: 'janhq',
+      author: 'cortexhub',
       sha: '',
       downloads: 0,
       lastModified: '',
@@ -161,6 +163,9 @@
   });
 
   data.modelUrl = url;
+
+
+
   return data;
 }
 
@@ -199,7 +204,7 @@ export async function getHFModelMetadata(
       version,
     };
   } catch (err) {
-    console.log('Failed to get model metadata:', err);
+    console.log('Failed to get model metadata:', err.message);
     return undefined;
   }
 }

From 13be3b12ee2675b49c5bbb36e2edf0beaaa39e52 Mon Sep 17 00:00:00 2001
From: Louis
Date: Tue, 18 Jun 2024 00:15:15 +0700
Subject: [PATCH 2/3] chore: update `models pull` default registry url so
 users can access and find available models

---
 .../src/infrastructure/commanders/models/model-pull.command.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cortex-js/src/infrastructure/commanders/models/model-pull.command.ts b/cortex-js/src/infrastructure/commanders/models/model-pull.command.ts
index 287fc2258..c6b81d9cf 100644
--- a/cortex-js/src/infrastructure/commanders/models/model-pull.command.ts
+++ b/cortex-js/src/infrastructure/commanders/models/model-pull.command.ts
@@ -10,7 +10,8 @@ import { ModelNotFoundException } from '@/infrastructure/exception/model-not-fou
   aliases: ['download'],
   arguments: '<model_id>',
   argsDescription: { model_id: 'Model repo to pull' },
-  description: 'Download a model. Working with HuggingFace model id.',
+  description:
+    'Download a model from a registry. Works with HuggingFace repositories. For available models, please visit https://huggingface.co/cortexhub',
 })
 @SetCommandContext()
 export class ModelPullCommand extends CommandRunner {

From 31d9d7184f1db4a1b6a85d2e817a15996bee0b35 Mon Sep 17 00:00:00 2001
From: Louis Le
Date: Tue, 18 Jun 2024 14:43:27 +0700
Subject: [PATCH 3/3] chore: install the latest ONNX engine release by default

---
 .../src/infrastructure/commanders/usecases/init.cli.usecases.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts
index cc053df6e..5c8fda51c 100644
--- a/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts
+++ b/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts
@@ -301,7 +301,7 @@ export class InitCliUsecases {
    * @param engineFileName Name of the release asset to download
    */
   async installONNXEngine(
-    version: string = 'v0.1.1',
+    version: string = 'latest',
     engineFileName: string = 'windows-amd64',
   ) {
     const res = await firstValueFrom(