fix: 3549 inference on cpu is lower since 0.5.3
fix: correct extension description

fix: vulkan should not have instructions or run mode included

fix: OS path delimiter

test: add tests

chore: should set run mode to CPU programmatically when ngl is not set

chore: shorten download URL
louis-menlo committed Sep 9, 2024
1 parent fe9936c commit 16fb2f1
Showing 11 changed files with 910 additions and 363 deletions.
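Note: the startStopModel.ts diff below replaces the old approach of spawning the bundled cortex-cpp (nitro) binary and driving it over HTTP with a much smaller delegation: the server now dynamically imports the installed inference-cortex-extension bundle and calls its loadModel / unloadModel directly. A minimal sketch of that pattern, with the extension interface inferred from the calls visible in the diff (error handling and the return shape are simplified; this is not the exact implementation):

    import { join } from 'path'

    // Shape of the extension module, inferred from the calls made in the diff below.
    interface CortexExtension {
      loadModel(opts: { modelFolder: string; model: string }, settings?: object): Promise<void>
      unloadModel(): Promise<void>
    }

    // Reuse the cortex extension's own implementation instead of spawning cortex-cpp,
    // so the API server and the extension cannot drift out of sync.
    const loadViaExtension = async (
      extensionsPath: string,
      modelFolder: string,
      model: string,
      settings?: object
    ): Promise<{ message: string }> => {
      const entry = join(
        extensionsPath,
        '@janhq',
        'inference-cortex-extension',
        'dist',
        'node',
        'index.cjs'
      )
      const extension: CortexExtension = await import(entry)
      await extension.loadModel({ modelFolder, model }, settings)
      return { message: 'Model started' }
    }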
371 changes: 47 additions & 324 deletions core/src/node/api/restful/helper/startStopModel.ts
@@ -1,31 +1,13 @@
-import fs from 'fs'
 import { join } from 'path'
-import {
-  getJanDataFolderPath,
-  getJanExtensionsPath,
-  getSystemResourceInfo,
-  log,
-} from '../../../helper'
-import { ChildProcessWithoutNullStreams, spawn } from 'child_process'
-import { Model, ModelSettingParams, PromptTemplate } from '../../../../types'
-import {
-  LOCAL_HOST,
-  NITRO_DEFAULT_PORT,
-  NITRO_HTTP_KILL_URL,
-  NITRO_HTTP_LOAD_MODEL_URL,
-  NITRO_HTTP_VALIDATE_MODEL_URL,
-  SUPPORTED_MODEL_FORMAT,
-} from './consts'
-
-// The subprocess instance for Nitro
-let subprocess: ChildProcessWithoutNullStreams | undefined = undefined
-
-// TODO: move this to core type
-interface NitroModelSettings extends ModelSettingParams {
-  llama_model_path: string
-  cpu_threads: number
-}
+import { getJanDataFolderPath, getJanExtensionsPath, log } from '../../../helper'
+import { ModelSettingParams } from '../../../../types'
 
+/**
+ * Start a model
+ * @param modelId
+ * @param settingParams
+ * @returns
+ */
 export const startModel = async (modelId: string, settingParams?: ModelSettingParams) => {
   try {
     await runModel(modelId, settingParams)
@@ -40,316 +22,57 @@ export const startModel = async (modelId: string, settingParams?: ModelSettingPa
   }
 }
 
-const runModel = async (modelId: string, settingParams?: ModelSettingParams): Promise<void> => {
+/**
+ * Run a model using installed cortex extension
+ * @param model
+ * @param settingParams
+ */
+const runModel = async (model: string, settingParams?: ModelSettingParams): Promise<void> => {
   const janDataFolderPath = getJanDataFolderPath()
-  const modelFolderFullPath = join(janDataFolderPath, 'models', modelId)
-
-  if (!fs.existsSync(modelFolderFullPath)) {
-    throw new Error(`Model not found: ${modelId}`)
-  }
-
-  const files: string[] = fs.readdirSync(modelFolderFullPath)
-
-  // Look for GGUF model file
-  const ggufBinFile = files.find((file) => file.toLowerCase().includes(SUPPORTED_MODEL_FORMAT))
-
-  const modelMetadataPath = join(modelFolderFullPath, 'model.json')
-  const modelMetadata: Model = JSON.parse(fs.readFileSync(modelMetadataPath, 'utf-8'))
-
-  if (!ggufBinFile) {
-    throw new Error('No GGUF model file found')
-  }
-  const modelBinaryPath = join(modelFolderFullPath, ggufBinFile)
-
-  const nitroResourceProbe = await getSystemResourceInfo()
-  const nitroModelSettings: NitroModelSettings = {
-    // This is critical and requires real CPU physical core count (or performance core)
-    cpu_threads: Math.max(1, nitroResourceProbe.numCpuPhysicalCore),
-    ...modelMetadata.settings,
-    ...settingParams,
-    llama_model_path: modelBinaryPath,
-    ...(modelMetadata.settings.mmproj && {
-      mmproj: join(modelFolderFullPath, modelMetadata.settings.mmproj),
-    }),
-  }
-
-  log(`[SERVER]::Debug: Nitro model settings: ${JSON.stringify(nitroModelSettings)}`)
-
-  // Convert settings.prompt_template to system_prompt, user_prompt, ai_prompt
-  if (modelMetadata.settings.prompt_template) {
-    const promptTemplate = modelMetadata.settings.prompt_template
-    const prompt = promptTemplateConverter(promptTemplate)
-    if (prompt?.error) {
-      throw new Error(prompt.error)
-    }
-    nitroModelSettings.system_prompt = prompt.system_prompt
-    nitroModelSettings.user_prompt = prompt.user_prompt
-    nitroModelSettings.ai_prompt = prompt.ai_prompt
-  }
-
-  await runNitroAndLoadModel(modelId, nitroModelSettings)
-}
-
-// TODO: move to util
-const promptTemplateConverter = (promptTemplate: string): PromptTemplate => {
-  // Split the string using the markers
-  const systemMarker = '{system_message}'
-  const promptMarker = '{prompt}'
-
-  if (promptTemplate.includes(systemMarker) && promptTemplate.includes(promptMarker)) {
-    // Find the indices of the markers
-    const systemIndex = promptTemplate.indexOf(systemMarker)
-    const promptIndex = promptTemplate.indexOf(promptMarker)
-
-    // Extract the parts of the string
-    const system_prompt = promptTemplate.substring(0, systemIndex)
-    const user_prompt = promptTemplate.substring(systemIndex + systemMarker.length, promptIndex)
-    const ai_prompt = promptTemplate.substring(promptIndex + promptMarker.length)
-
-    // Return the split parts
-    return { system_prompt, user_prompt, ai_prompt }
-  } else if (promptTemplate.includes(promptMarker)) {
-    // Extract the parts of the string for the case where only promptMarker is present
-    const promptIndex = promptTemplate.indexOf(promptMarker)
-    const user_prompt = promptTemplate.substring(0, promptIndex)
-    const ai_prompt = promptTemplate.substring(promptIndex + promptMarker.length)
-
-    // Return the split parts
-    return { user_prompt, ai_prompt }
-  }
-
-  // Return an error if none of the conditions are met
-  return { error: 'Cannot split prompt template' }
-}
-
-const runNitroAndLoadModel = async (modelId: string, modelSettings: NitroModelSettings) => {
-  // Gather system information for CPU physical cores and memory
-  const tcpPortUsed = require('tcp-port-used')
-
-  await stopModel(modelId)
-  await tcpPortUsed.waitUntilFree(NITRO_DEFAULT_PORT, 300, 5000)
-
-  /**
-   * There is a problem with Windows process manager
-   * Should wait for awhile to make sure the port is free and subprocess is killed
-   * The tested threshold is 500ms
-   **/
-  if (process.platform === 'win32') {
-    await new Promise((resolve) => setTimeout(resolve, 500))
-  }
-
-  await spawnNitroProcess()
-  await loadLLMModel(modelSettings)
-  await validateModelStatus()
-}
-
-const spawnNitroProcess = async (): Promise<void> => {
-  log(`[SERVER]::Debug: Spawning cortex subprocess...`)
-
-  let binaryFolder = join(
+  const modelFolder = join(janDataFolderPath, 'models', model)
+  let module = join(
     getJanExtensionsPath(),
     '@janhq',
     'inference-cortex-extension',
     'dist',
-    'bin'
-  )
-
-  let executableOptions = executableNitroFile()
-  const tcpPortUsed = require('tcp-port-used')
-
-  const args: string[] = ['1', LOCAL_HOST, NITRO_DEFAULT_PORT.toString()]
-  // Execute the binary
-  log(
-    `[SERVER]::Debug: Spawn cortex at path: ${executableOptions.executablePath}, and args: ${args}`
+    'node',
+    'index.cjs'
   )
-  subprocess = spawn(
-    executableOptions.executablePath,
-    ['1', LOCAL_HOST, NITRO_DEFAULT_PORT.toString()],
-    {
-      cwd: binaryFolder,
-      env: {
-        ...process.env,
-        CUDA_VISIBLE_DEVICES: executableOptions.cudaVisibleDevices,
-      },
-    }
+  // Just reuse the cortex extension implementation, don't duplicate then lost of sync
+  return import(module).then((extension) =>
+    extension
+      .loadModel(
+        {
+          modelFolder,
+          model,
+        },
+        settingParams
+      )
+      .then(() => log(`[SERVER]::Debug: Model is loaded`))
+      .then({
+        message: 'Model started',
+      })
   )
-
-  // Handle subprocess output
-  subprocess.stdout.on('data', (data: any) => {
-    log(`[SERVER]::Debug: ${data}`)
-  })
-
-  subprocess.stderr.on('data', (data: any) => {
-    log(`[SERVER]::Error: ${data}`)
-  })
-
-  subprocess.on('close', (code: any) => {
-    log(`[SERVER]::Debug: cortex exited with code: ${code}`)
-    subprocess = undefined
-  })
-
-  tcpPortUsed.waitUntilUsed(NITRO_DEFAULT_PORT, 300, 30000).then(() => {
-    log(`[SERVER]::Debug: cortex is ready`)
-  })
 }
-
-type NitroExecutableOptions = {
-  executablePath: string
-  cudaVisibleDevices: string
-}
-
-const executableNitroFile = (): NitroExecutableOptions => {
-  const nvidiaInfoFilePath = join(getJanDataFolderPath(), 'settings', 'settings.json')
-  let binaryFolder = join(
+/*
+ * Stop model and kill nitro process.
+ */
+export const stopModel = async (_modelId: string) => {
+  let module = join(
     getJanExtensionsPath(),
     '@janhq',
    'inference-cortex-extension',
     'dist',
-    'bin'
+    'node',
+    'index.cjs'
   )
-
-  let cudaVisibleDevices = ''
-  let binaryName = 'cortex-cpp'
-  /**
-   * The binary folder is different for each platform.
-   */
-  if (process.platform === 'win32') {
-    /**
-     * For Windows: win-cpu, win-cuda-11-7, win-cuda-12-0
-     */
-    let nvidiaInfo = JSON.parse(fs.readFileSync(nvidiaInfoFilePath, 'utf-8'))
-    if (nvidiaInfo['run_mode'] === 'cpu') {
-      binaryFolder = join(binaryFolder, 'win-cpu')
-    } else {
-      if (nvidiaInfo['cuda'].version === '12') {
-        binaryFolder = join(binaryFolder, 'win-cuda-12-0')
-      } else {
-        binaryFolder = join(binaryFolder, 'win-cuda-11-7')
-      }
-      cudaVisibleDevices = nvidiaInfo['gpu_highest_vram']
-    }
-    binaryName = 'cortex-cpp.exe'
-  } else if (process.platform === 'darwin') {
-    /**
-     * For MacOS: mac-universal both Silicon and InteL
-     */
-    if(process.arch === 'arm64') {
-      binaryFolder = join(binaryFolder, 'mac-arm64')
-    } else {
-      binaryFolder = join(binaryFolder, 'mac-amd64')
-    }
-  } else {
-    /**
-     * For Linux: linux-cpu, linux-cuda-11-7, linux-cuda-12-0
-     */
-    let nvidiaInfo = JSON.parse(fs.readFileSync(nvidiaInfoFilePath, 'utf-8'))
-    if (nvidiaInfo['run_mode'] === 'cpu') {
-      binaryFolder = join(binaryFolder, 'linux-cpu')
-    } else {
-      if (nvidiaInfo['cuda'].version === '12') {
-        binaryFolder = join(binaryFolder, 'linux-cuda-12-0')
-      } else {
-        binaryFolder = join(binaryFolder, 'linux-cuda-11-7')
-      }
-      cudaVisibleDevices = nvidiaInfo['gpu_highest_vram']
-    }
-  }
-
-  return {
-    executablePath: join(binaryFolder, binaryName),
-    cudaVisibleDevices,
-  }
-}
-
-const validateModelStatus = async (): Promise<void> => {
-  // Send a GET request to the validation URL.
-  // Retry the request up to 3 times if it fails, with a delay of 500 milliseconds between retries.
-  const fetchRT = require('fetch-retry')
-  const fetchRetry = fetchRT(fetch)
-
-  return fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
-    method: 'GET',
-    headers: {
-      'Content-Type': 'application/json',
-    },
-    retries: 5,
-    retryDelay: 500,
-  }).then(async (res: Response) => {
-    log(`[SERVER]::Debug: Validate model state success with response ${JSON.stringify(res)}`)
-    // If the response is OK, check model_loaded status.
-    if (res.ok) {
-      const body = await res.json()
-      // If the model is loaded, return an empty object.
-      // Otherwise, return an object with an error message.
-      if (body.model_loaded) {
-        return Promise.resolve()
-      }
-    }
-    return Promise.reject('Validate model status failed')
-  })
-}
-
-const loadLLMModel = async (settings: NitroModelSettings): Promise<Response> => {
-  log(`[SERVER]::Debug: Loading model with params ${JSON.stringify(settings)}`)
-  const fetchRT = require('fetch-retry')
-  const fetchRetry = fetchRT(fetch)
-
-  return fetchRetry(NITRO_HTTP_LOAD_MODEL_URL, {
-    method: 'POST',
-    headers: {
-      'Content-Type': 'application/json',
-    },
-    body: JSON.stringify(settings),
-    retries: 3,
-    retryDelay: 500,
-  })
-    .then((res: any) => {
-      log(`[SERVER]::Debug: Load model request with response ${JSON.stringify(res)}`)
-      return Promise.resolve(res)
-    })
-    .catch((err: any) => {
-      log(`[SERVER]::Error: Load model failed with error ${err}`)
-      return Promise.reject(err)
-    })
-}
-
-/**
- * Stop model and kill nitro process.
- */
-export const stopModel = async (_modelId: string) => {
-  if (!subprocess) {
-    return {
-      error: "Model isn't running",
-    }
-  }
-  return new Promise((resolve, reject) => {
-    const controller = new AbortController()
-    setTimeout(() => {
-      controller.abort()
-      reject({
-        error: 'Failed to stop model: Timedout',
+  // Just reuse the cortex extension implementation, don't duplicate then lost of sync
+  return import(module).then((extension) =>
+    extension
+      .unloadModel()
+      .then(() => log(`[SERVER]::Debug: Model is unloaded`))
+      .then({
+        message: 'Model stopped',
       })
-    }, 5000)
-    const tcpPortUsed = require('tcp-port-used')
-    log(`[SERVER]::Debug: Request to kill cortex`)
-
-    fetch(NITRO_HTTP_KILL_URL, {
-      method: 'DELETE',
-      signal: controller.signal,
-    })
-      .then(() => {
-        subprocess?.kill()
-        subprocess = undefined
-      })
-      .catch(() => {
-        // don't need to do anything, we still kill the subprocess
-      })
-      .then(() => tcpPortUsed.waitUntilFree(NITRO_DEFAULT_PORT, 300, 5000))
-      .then(() => log(`[SERVER]::Debug: Nitro process is terminated`))
-      .then(() =>
-        resolve({
-          message: 'Model stopped',
-        })
-      )
-  })
+  )
 }
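The "run mode to CPU when ngl is not set" bullet in the commit message lands in the extension code rather than in the file shown above. A hypothetical sketch of that behavior — the ngl and run_mode field names are illustrative only, not taken from this diff:

    type RunMode = 'cpu' | 'gpu'

    interface LoadSettings {
      // Number of layers to offload to the GPU; undefined means the caller never asked for GPU offload.
      ngl?: number
      run_mode?: RunMode
    }

    // If ngl was never set, fall back to CPU instead of inheriting a stale GPU run mode from settings.
    const resolveRunMode = (settings: LoadSettings): RunMode =>
      settings.ngl === undefined ? 'cpu' : settings.run_mode ?? 'gpu'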