diff --git a/.github/workflows/nightly-integrate-cortex-cpp.yml b/.github/workflows/nightly-integrate-cortex-cpp.yml
index e0b48bc46f..8ddc40a118 100644
--- a/.github/workflows/nightly-integrate-cortex-cpp.yml
+++ b/.github/workflows/nightly-integrate-cortex-cpp.yml
@@ -51,13 +51,13 @@ jobs:
           latest_prerelease_asset_count=$(get_asset_count "$latest_prerelease_name")

           if [ "$current_version_name" = "$latest_prerelease_name" ]; then
-            echo "cortex cpp remote repo doesn't have update today, skip update cortex-cpp for today nightly build"
+            echo "cortex.cpp remote repo has no update today; skipping the cortex.cpp update for today's nightly build"
             echo "::set-output name=pr_created::false"
             exit 0
           fi

           if [ "$current_version_asset_count" != "$latest_prerelease_asset_count" ]; then
-            echo "Latest prerelease version has different number of assets, somethink went wrong, skip update cortex-cpp for today nightly build"
+            echo "Latest prerelease version has a different number of assets; something went wrong, skipping the cortex.cpp update for today's nightly build"
             echo "::set-output name=pr_created::false"
             exit 1
           fi
diff --git a/core/src/node/api/restful/helper/startStopModel.ts b/core/src/node/api/restful/helper/startStopModel.ts
index 8665850da8..d1a23dca90 100644
--- a/core/src/node/api/restful/helper/startStopModel.ts
+++ b/core/src/node/api/restful/helper/startStopModel.ts
@@ -1,31 +1,13 @@
-import fs from 'fs'
 import { join } from 'path'
-import {
-  getJanDataFolderPath,
-  getJanExtensionsPath,
-  getSystemResourceInfo,
-  log,
-} from '../../../helper'
-import { ChildProcessWithoutNullStreams, spawn } from 'child_process'
-import { Model, ModelSettingParams, PromptTemplate } from '../../../../types'
-import {
-  LOCAL_HOST,
-  NITRO_DEFAULT_PORT,
-  NITRO_HTTP_KILL_URL,
-  NITRO_HTTP_LOAD_MODEL_URL,
-  NITRO_HTTP_VALIDATE_MODEL_URL,
-  SUPPORTED_MODEL_FORMAT,
-} from './consts'
-
-// The subprocess instance for Nitro
-let subprocess: ChildProcessWithoutNullStreams | undefined = undefined
-
-// TODO: move this to core type
-interface NitroModelSettings extends ModelSettingParams {
-  llama_model_path: string
-  cpu_threads: number
-}
+import { getJanDataFolderPath, getJanExtensionsPath, log } from '../../../helper'
+import { ModelSettingParams } from '../../../../types'

+/**
+ * Start a model
+ * @param modelId
+ * @param settingParams
+ * @returns
+ */
 export const startModel = async (modelId: string, settingParams?: ModelSettingParams) => {
   try {
     await runModel(modelId, settingParams)
@@ -40,316 +22,57 @@ export const startModel = async (modelId: string, settingParams?: ModelSettingPa
   }
 }

-const runModel = async (modelId: string, settingParams?: ModelSettingParams): Promise<void> => {
+/**
+ * Run a model using the installed cortex extension
+ * @param model
+ * @param settingParams
+ */
+const runModel = async (model: string, settingParams?: ModelSettingParams): Promise<void> => {
   const janDataFolderPath = getJanDataFolderPath()
-  const modelFolderFullPath = join(janDataFolderPath, 'models', modelId)
-
-  if (!fs.existsSync(modelFolderFullPath)) {
-    throw new Error(`Model not found: ${modelId}`)
-  }
-
-  const files: string[] = fs.readdirSync(modelFolderFullPath)
-
-  // Look for GGUF model file
-  const ggufBinFile = files.find((file) => file.toLowerCase().includes(SUPPORTED_MODEL_FORMAT))
-
-  const modelMetadataPath = join(modelFolderFullPath, 'model.json')
-  const modelMetadata: Model = JSON.parse(fs.readFileSync(modelMetadataPath, 'utf-8'))
-
-  if (!ggufBinFile) {
-    throw new Error('No GGUF model file found')
-  }
-  const modelBinaryPath = join(modelFolderFullPath, ggufBinFile)
-
-  const nitroResourceProbe = await getSystemResourceInfo()
-  const nitroModelSettings: NitroModelSettings = {
-    // This is critical and requires real CPU physical core count (or performance core)
-    cpu_threads: Math.max(1, nitroResourceProbe.numCpuPhysicalCore),
-    ...modelMetadata.settings,
-    ...settingParams,
-    llama_model_path: modelBinaryPath,
-    ...(modelMetadata.settings.mmproj && {
-      mmproj: join(modelFolderFullPath, modelMetadata.settings.mmproj),
-    }),
-  }
-
-  log(`[SERVER]::Debug: Nitro model settings: ${JSON.stringify(nitroModelSettings)}`)
-
-  // Convert settings.prompt_template to system_prompt, user_prompt, ai_prompt
-  if (modelMetadata.settings.prompt_template) {
-    const promptTemplate = modelMetadata.settings.prompt_template
-    const prompt = promptTemplateConverter(promptTemplate)
-    if (prompt?.error) {
-      throw new Error(prompt.error)
-    }
-    nitroModelSettings.system_prompt = prompt.system_prompt
-    nitroModelSettings.user_prompt = prompt.user_prompt
-    nitroModelSettings.ai_prompt = prompt.ai_prompt
-  }
-
-  await runNitroAndLoadModel(modelId, nitroModelSettings)
-}
-
-// TODO: move to util
-const promptTemplateConverter = (promptTemplate: string): PromptTemplate => {
-  // Split the string using the markers
-  const systemMarker = '{system_message}'
-  const promptMarker = '{prompt}'
-
-  if (promptTemplate.includes(systemMarker) && promptTemplate.includes(promptMarker)) {
-    // Find the indices of the markers
-    const systemIndex = promptTemplate.indexOf(systemMarker)
-    const promptIndex = promptTemplate.indexOf(promptMarker)
-
-    // Extract the parts of the string
-    const system_prompt = promptTemplate.substring(0, systemIndex)
-    const user_prompt = promptTemplate.substring(systemIndex + systemMarker.length, promptIndex)
-    const ai_prompt = promptTemplate.substring(promptIndex + promptMarker.length)
-
-    // Return the split parts
-    return { system_prompt, user_prompt, ai_prompt }
-  } else if (promptTemplate.includes(promptMarker)) {
-    // Extract the parts of the string for the case where only promptMarker is present
-    const promptIndex = promptTemplate.indexOf(promptMarker)
-    const user_prompt = promptTemplate.substring(0, promptIndex)
-    const ai_prompt = promptTemplate.substring(promptIndex + promptMarker.length)
-
-    // Return the split parts
-    return { user_prompt, ai_prompt }
-  }
-
-  // Return an error if none of the conditions are met
-  return { error: 'Cannot split prompt template' }
-}
-
-const runNitroAndLoadModel = async (modelId: string, modelSettings: NitroModelSettings) => {
-  // Gather system information for CPU physical cores and memory
-  const tcpPortUsed = require('tcp-port-used')
-
-  await stopModel(modelId)
-  await tcpPortUsed.waitUntilFree(NITRO_DEFAULT_PORT, 300, 5000)
-
-  /**
-   * There is a problem with Windows process manager
-   * Should wait for awhile to make sure the port is free and subprocess is killed
-   * The tested threshold is 500ms
-   **/
-  if (process.platform === 'win32') {
-    await new Promise((resolve) => setTimeout(resolve, 500))
-  }
-
-  await spawnNitroProcess()
-  await loadLLMModel(modelSettings)
-  await validateModelStatus()
-}
-
-const spawnNitroProcess = async (): Promise<void> => {
-  log(`[SERVER]::Debug: Spawning cortex subprocess...`)
-
-  let binaryFolder = join(
+  const modelFolder = join(janDataFolderPath, 'models', model)
+  let module = join(
     getJanExtensionsPath(),
     '@janhq',
     'inference-cortex-extension',
     'dist',
-    'bin'
-  )
-
-  let executableOptions = executableNitroFile()
-  const tcpPortUsed = require('tcp-port-used')
-
-  const args: string[] = ['1', LOCAL_HOST, NITRO_DEFAULT_PORT.toString()]
-  // Execute the binary
-  log(
-    `[SERVER]::Debug: Spawn cortex at path: ${executableOptions.executablePath}, and args: ${args}`
+    'node',
+    'index.cjs'
   )
-  subprocess = spawn(
-    executableOptions.executablePath,
-    ['1', LOCAL_HOST, NITRO_DEFAULT_PORT.toString()],
-    {
-      cwd: binaryFolder,
-      env: {
-        ...process.env,
-        CUDA_VISIBLE_DEVICES: executableOptions.cudaVisibleDevices,
-      },
-    }
+  // Reuse the cortex extension implementation rather than duplicating it here, so the two cannot drift out of sync
+  return import(module).then((extension) =>
+    extension
+      .loadModel(
+        {
+          modelFolder,
+          model,
+        },
+        settingParams
+      )
+      .then(() => log(`[SERVER]::Debug: Model is loaded`))
+      .then(() => ({
+        message: 'Model started',
+      }))
   )
-
-  // Handle subprocess output
-  subprocess.stdout.on('data', (data: any) => {
-    log(`[SERVER]::Debug: ${data}`)
-  })
-
-  subprocess.stderr.on('data', (data: any) => {
-    log(`[SERVER]::Error: ${data}`)
-  })
-
-  subprocess.on('close', (code: any) => {
-    log(`[SERVER]::Debug: cortex exited with code: ${code}`)
-    subprocess = undefined
-  })
-
-  tcpPortUsed.waitUntilUsed(NITRO_DEFAULT_PORT, 300, 30000).then(() => {
-    log(`[SERVER]::Debug: cortex is ready`)
-  })
-}
-
-type NitroExecutableOptions = {
-  executablePath: string
-  cudaVisibleDevices: string
 }
-
-const executableNitroFile = (): NitroExecutableOptions => {
-  const nvidiaInfoFilePath = join(getJanDataFolderPath(), 'settings', 'settings.json')
-  let binaryFolder = join(
+/**
+ * Stop a model via the installed cortex extension.
+ */
+export const stopModel = async (_modelId: string) => {
+  let module = join(
     getJanExtensionsPath(),
     '@janhq',
     'inference-cortex-extension',
     'dist',
-    'bin'
+    'node',
+    'index.cjs'
   )
-
-  let cudaVisibleDevices = ''
-  let binaryName = 'cortex-cpp'
-  /**
-   * The binary folder is different for each platform.
-   */
-  if (process.platform === 'win32') {
-    /**
-     * For Windows: win-cpu, win-cuda-11-7, win-cuda-12-0
-     */
-    let nvidiaInfo = JSON.parse(fs.readFileSync(nvidiaInfoFilePath, 'utf-8'))
-    if (nvidiaInfo['run_mode'] === 'cpu') {
-      binaryFolder = join(binaryFolder, 'win-cpu')
-    } else {
-      if (nvidiaInfo['cuda'].version === '12') {
-        binaryFolder = join(binaryFolder, 'win-cuda-12-0')
-      } else {
-        binaryFolder = join(binaryFolder, 'win-cuda-11-7')
-      }
-      cudaVisibleDevices = nvidiaInfo['gpu_highest_vram']
-    }
-    binaryName = 'cortex-cpp.exe'
-  } else if (process.platform === 'darwin') {
-    /**
-     * For MacOS: mac-universal both Silicon and InteL
-     */
-    if(process.arch === 'arm64') {
-      binaryFolder = join(binaryFolder, 'mac-arm64')
-    } else {
-      binaryFolder = join(binaryFolder, 'mac-amd64')
-    }
-  } else {
-    /**
-     * For Linux: linux-cpu, linux-cuda-11-7, linux-cuda-12-0
-     */
-    let nvidiaInfo = JSON.parse(fs.readFileSync(nvidiaInfoFilePath, 'utf-8'))
-    if (nvidiaInfo['run_mode'] === 'cpu') {
-      binaryFolder = join(binaryFolder, 'linux-cpu')
-    } else {
-      if (nvidiaInfo['cuda'].version === '12') {
-        binaryFolder = join(binaryFolder, 'linux-cuda-12-0')
-      } else {
-        binaryFolder = join(binaryFolder, 'linux-cuda-11-7')
-      }
-      cudaVisibleDevices = nvidiaInfo['gpu_highest_vram']
-    }
-  }
-
-  return {
-    executablePath: join(binaryFolder, binaryName),
-    cudaVisibleDevices,
-  }
-}
-
-const validateModelStatus = async (): Promise<void> => {
-  // Send a GET request to the validation URL.
-  // Retry the request up to 3 times if it fails, with a delay of 500 milliseconds between retries.
-  const fetchRT = require('fetch-retry')
-  const fetchRetry = fetchRT(fetch)
-
-  return fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
-    method: 'GET',
-    headers: {
-      'Content-Type': 'application/json',
-    },
-    retries: 5,
-    retryDelay: 500,
-  }).then(async (res: Response) => {
-    log(`[SERVER]::Debug: Validate model state success with response ${JSON.stringify(res)}`)
-    // If the response is OK, check model_loaded status.
-    if (res.ok) {
-      const body = await res.json()
-      // If the model is loaded, return an empty object.
-      // Otherwise, return an object with an error message.
-      if (body.model_loaded) {
-        return Promise.resolve()
-      }
-    }
-    return Promise.reject('Validate model status failed')
-  })
-}
-
-const loadLLMModel = async (settings: NitroModelSettings): Promise<Response> => {
-  log(`[SERVER]::Debug: Loading model with params ${JSON.stringify(settings)}`)
-  const fetchRT = require('fetch-retry')
-  const fetchRetry = fetchRT(fetch)
-
-  return fetchRetry(NITRO_HTTP_LOAD_MODEL_URL, {
-    method: 'POST',
-    headers: {
-      'Content-Type': 'application/json',
-    },
-    body: JSON.stringify(settings),
-    retries: 3,
-    retryDelay: 500,
-  })
-    .then((res: any) => {
-      log(`[SERVER]::Debug: Load model request with response ${JSON.stringify(res)}`)
-      return Promise.resolve(res)
-    })
-    .catch((err: any) => {
-      log(`[SERVER]::Error: Load model failed with error ${err}`)
-      return Promise.reject(err)
-    })
-}
-
-/**
- * Stop model and kill nitro process.
- */
-export const stopModel = async (_modelId: string) => {
-  if (!subprocess) {
-    return {
-      error: "Model isn't running",
-    }
-  }
-  return new Promise((resolve, reject) => {
-    const controller = new AbortController()
-    setTimeout(() => {
-      controller.abort()
-      reject({
-        error: 'Failed to stop model: Timedout',
-      })
-    }, 5000)
-    const tcpPortUsed = require('tcp-port-used')
-    log(`[SERVER]::Debug: Request to kill cortex`)
-
-    fetch(NITRO_HTTP_KILL_URL, {
-      method: 'DELETE',
-      signal: controller.signal,
-    })
-      .then(() => {
-        subprocess?.kill()
-        subprocess = undefined
-      })
-      .catch(() => {
-        // don't need to do anything, we still kill the subprocess
-      })
-      .then(() => tcpPortUsed.waitUntilFree(NITRO_DEFAULT_PORT, 300, 5000))
-      .then(() => log(`[SERVER]::Debug: Nitro process is terminated`))
-      .then(() =>
-        resolve({
-          message: 'Model stopped',
-        })
-      )
-  })
+  // Reuse the cortex extension implementation rather than duplicating it here, so the two cannot drift out of sync
+  return import(module).then((extension) =>
+    extension
+      .unloadModel()
+      .then(() => log(`[SERVER]::Debug: Model is unloaded`))
+      .then(() => ({
+        message: 'Model stopped',
+      }))
+  )
 }
diff --git a/extensions/inference-nitro-extension/download.bat b/extensions/inference-nitro-extension/download.bat
index b7fbd32520..7acd385d53 100644
--- a/extensions/inference-nitro-extension/download.bat
+++ b/extensions/inference-nitro-extension/download.bat
@@ -1,3 +1,31 @@
 @echo off
+set BIN_PATH=./bin
 set /p CORTEX_VERSION=<./bin/version.txt
-.\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-cuda-12-0 && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-cuda-11-7 && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-cpu && .\node_modules\.bin\download https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz -e --strip 1 -o ./bin/win-vulkan && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-noavx-cuda-12-0.tar.gz -e --strip 1 -o ./bin/win-cuda-12-0/engines/cortex.llamacpp && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-noavx-cuda-11-7.tar.gz -e --strip 1 -o ./bin/win-cuda-11-7/engines/cortex.llamacpp && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-noavx.tar.gz -e --strip 1 -o ./bin/win-cpu/engines/cortex.llamacpp && .\node_modules\.bin\download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-windows-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/win-vulkan/engines/cortex.llamacpp
+
+@REM Download cortex.llamacpp binaries
+set VERSION=v0.1.25
+set DOWNLOAD_URL=https://github.com/janhq/cortex.llamacpp/releases/download/%VERSION%/cortex.llamacpp-0.1.25-windows-amd64
+set SUBFOLDERS=win-cuda-12-0 win-cuda-11-7 win-noavx win-avx win-avx2 win-avx512 win-vulkan
+
+call .\node_modules\.bin\download -e --strip 1 -o %BIN_PATH% https://github.com/janhq/cortex/releases/download/v%CORTEX_VERSION%/cortex-cpp-%CORTEX_VERSION%-windows-amd64.tar.gz
+call .\node_modules\.bin\download %DOWNLOAD_URL%-avx2-cuda-12-0.tar.gz -e --strip 1 -o %BIN_PATH%/win-cuda-12-0/engines/cortex.llamacpp
+call .\node_modules\.bin\download %DOWNLOAD_URL%-avx2-cuda-11-7.tar.gz -e --strip 1 -o %BIN_PATH%/win-cuda-11-7/engines/cortex.llamacpp
+call .\node_modules\.bin\download %DOWNLOAD_URL%-noavx.tar.gz -e --strip 1 -o %BIN_PATH%/win-noavx/engines/cortex.llamacpp
+call .\node_modules\.bin\download %DOWNLOAD_URL%-avx.tar.gz -e --strip 1 -o %BIN_PATH%/win-avx/engines/cortex.llamacpp
+call .\node_modules\.bin\download %DOWNLOAD_URL%-avx2.tar.gz -e --strip 1 -o %BIN_PATH%/win-avx2/engines/cortex.llamacpp
+call .\node_modules\.bin\download %DOWNLOAD_URL%-avx512.tar.gz -e --strip 1 -o %BIN_PATH%/win-avx512/engines/cortex.llamacpp
+call .\node_modules\.bin\download %DOWNLOAD_URL%-vulkan.tar.gz -e --strip 1 -o %BIN_PATH%/win-vulkan/engines/cortex.llamacpp
+
+@REM Loop through each folder and move DLLs (excluding engine.dll)
+for %%F in (%SUBFOLDERS%) do (
+    echo Processing folder: %BIN_PATH%\%%F
+
+    @REM Move all .dll files except engine.dll
+    for %%D in (%BIN_PATH%\%%F\engines\cortex.llamacpp\*.dll) do (
+        if /I not "%%~nxD"=="engine.dll" (
+            move "%%D" "%BIN_PATH%"
+        )
+    )
+)
+
+echo DLL files moved successfully.
\ No newline at end of file
diff --git a/extensions/inference-nitro-extension/download.sh b/extensions/inference-nitro-extension/download.sh
new file mode 100755
index 0000000000..98ed8504a4
--- /dev/null
+++ b/extensions/inference-nitro-extension/download.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Read CORTEX_VERSION
+CORTEX_VERSION=$(cat ./bin/version.txt)
+CORTEX_RELEASE_URL="https://github.com/janhq/cortex/releases/download"
+
+# Detect platform
+OS_TYPE=$(uname)
+
+if [ "$OS_TYPE" == "Linux" ]; then
+    # Linux downloads
+    download "${CORTEX_RELEASE_URL}/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz" -e --strip 1 -o "./bin"
+    chmod +x "./bin/cortex-cpp"
+
+    ENGINE_DOWNLOAD_URL="https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64"
+
+    # Download engines for Linux
+    download "${ENGINE_DOWNLOAD_URL}-noavx.tar.gz" -e --strip 1 -o "./bin/linux-noavx/engines/cortex.llamacpp" 1
+    download "${ENGINE_DOWNLOAD_URL}-avx.tar.gz" -e --strip 1 -o "./bin/linux-avx/engines/cortex.llamacpp" 1
+    download "${ENGINE_DOWNLOAD_URL}-avx2.tar.gz" -e --strip 1 -o "./bin/linux-avx2/engines/cortex.llamacpp" 1
+    download "${ENGINE_DOWNLOAD_URL}-avx512.tar.gz" -e --strip 1 -o "./bin/linux-avx512/engines/cortex.llamacpp" 1
+    download "${ENGINE_DOWNLOAD_URL}-avx2-cuda-12-0.tar.gz" -e --strip 1 -o "./bin/linux-cuda-12-0/engines/cortex.llamacpp" 1
+    download "${ENGINE_DOWNLOAD_URL}-avx2-cuda-11-7.tar.gz" -e --strip 1 -o "./bin/linux-cuda-11-7/engines/cortex.llamacpp" 1
+    download "${ENGINE_DOWNLOAD_URL}-vulkan.tar.gz" -e --strip 1 -o "./bin/linux-vulkan/engines/cortex.llamacpp" 1
+
+elif [ "$OS_TYPE" == "Darwin" ]; then
+    # macOS downloads
+    download "${CORTEX_RELEASE_URL}/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz" -e --strip 1 -o "./bin/mac-arm64" 1
+    download "${CORTEX_RELEASE_URL}/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz" -e --strip 1 -o "./bin/mac-x64" 1
+    chmod +x "./bin/mac-arm64/cortex-cpp"
+    chmod +x "./bin/mac-x64/cortex-cpp"
+
+    ENGINE_DOWNLOAD_URL="https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-mac"
+    # Download engines for macOS
+    download "${ENGINE_DOWNLOAD_URL}-arm64.tar.gz" -e --strip 1 -o ./bin/mac-arm64/engines/cortex.llamacpp
+    download "${ENGINE_DOWNLOAD_URL}-amd64.tar.gz" -e --strip 1 -o ./bin/mac-x64/engines/cortex.llamacpp
+
+else
+    echo "Unsupported operating system: $OS_TYPE"
+    exit 1
+fi
diff --git a/extensions/inference-nitro-extension/package.json b/extensions/inference-nitro-extension/package.json
index 7be4be69a1..1e3ea6d381 100644
--- a/extensions/inference-nitro-extension/package.json
+++ b/extensions/inference-nitro-extension/package.json
@@ -2,7 +2,7 @@
   "name": "@janhq/inference-cortex-extension",
   "productName": "Cortex Inference Engine",
   "version": "1.0.15",
-  "description": "This extension embeds cortex.cpp, a lightweight inference engine written in C++. See https://nitro.jan.ai.\nAdditional dependencies could be installed to run without Cuda Toolkit installation.",
+  "description": "This extension embeds cortex.cpp, a lightweight inference engine written in C++. See https://jan.ai.\nAdditional dependencies can be installed to run without a CUDA Toolkit installation.",
   "main": "dist/index.js",
   "node": "dist/node/index.cjs.js",
   "author": "Jan <service@jan.ai>",
@@ -10,13 +10,11 @@
   "scripts": {
     "test": "jest",
     "build": "tsc --module commonjs && rollup -c rollup.config.ts",
-    "downloadnitro:linux": "CORTEX_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cpu && chmod +x ./bin/linux-cpu/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0 && chmod +x ./bin/linux-cuda-12-0/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7 && chmod +x ./bin/linux-cuda-11-7/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-linux-amd64.tar.gz -e --strip 1 -o ./bin/linux-vulkan && chmod +x ./bin/linux-vulkan/cortex-cpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-noavx.tar.gz -e --strip 1 -o ./bin/linux-cpu/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-noavx-cuda-12-0.tar.gz -e --strip 1 -o ./bin/linux-cuda-12-0/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-noavx-cuda-11-7.tar.gz -e --strip 1 -o ./bin/linux-cuda-11-7/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-linux-amd64-vulkan.tar.gz -e --strip 1 -o ./bin/linux-vulkan/engines/cortex.llamacpp",
-    "downloadnitro:darwin": "CORTEX_VERSION=$(cat ./bin/version.txt) && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz -o ./bin/ && mkdir -p ./bin/mac-arm64 && tar -zxvf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz --strip-components=1 -C ./bin/mac-arm64 && rm -rf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-arm64.tar.gz && chmod +x ./bin/mac-arm64/cortex-cpp && download https://github.com/janhq/cortex/releases/download/v${CORTEX_VERSION}/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz -o ./bin/ && mkdir -p ./bin/mac-amd64 && tar -zxvf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz --strip-components=1 -C ./bin/mac-amd64 && rm -rf ./bin/cortex-cpp-${CORTEX_VERSION}-mac-amd64.tar.gz && chmod +x ./bin/mac-amd64/cortex-cpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-mac-arm64.tar.gz -e --strip 1 -o ./bin/mac-arm64/engines/cortex.llamacpp && download https://github.com/janhq/cortex.llamacpp/releases/download/v0.1.25/cortex.llamacpp-0.1.25-mac-amd64.tar.gz -e --strip 1 -o ./bin/mac-amd64/engines/cortex.llamacpp",
+    "downloadnitro:linux:darwin": "./download.sh",
     "downloadnitro:win32": "download.bat",
     "downloadnitro": "run-script-os",
     "build:publish:darwin": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && ../../.github/scripts/auto-sign.sh && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install",
"build:publish:win32": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install", - "build:publish:linux": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install", + "build:publish:win32:linux": "rimraf *.tgz --glob && yarn build && npm run downloadnitro && cpx \"bin/**\" \"dist/bin\" && npm pack && cpx *.tgz ../../pre-install", "build:publish": "yarn test && run-script-os" }, "exports": { @@ -49,6 +47,7 @@ }, "dependencies": { "@janhq/core": "file:../../core", + "cpu-instructions": "^0.0.13", "decompress": "^4.2.1", "fetch-retry": "^5.0.6", "rxjs": "^7.8.1", @@ -68,6 +67,7 @@ "tcp-port-used", "fetch-retry", "@janhq/core", - "decompress" + "decompress", + "cpu-instructions" ] } diff --git a/extensions/inference-nitro-extension/rollup.config.ts b/extensions/inference-nitro-extension/rollup.config.ts index fdd11f961a..4e1731a095 100644 --- a/extensions/inference-nitro-extension/rollup.config.ts +++ b/extensions/inference-nitro-extension/rollup.config.ts @@ -96,7 +96,7 @@ export default [ llama3170bJson, gemma22bJson, gemma29bJson, - gemma227bJson + gemma227bJson, ]), NODE: JSON.stringify(`${packageJson.name}/${packageJson.node}`), DEFAULT_SETTINGS: JSON.stringify(defaultSettingJson), @@ -117,7 +117,10 @@ export default [ // Allow json resolution json(), // Compile TypeScript files - typescript({ useTsconfigDeclarationDir: true }), + typescript({ + useTsconfigDeclarationDir: true, + exclude: ['**/__tests__', '**/*.test.ts'], + }), // Compile TypeScript files // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs) commonjs(), @@ -139,7 +142,7 @@ export default [ { file: 'dist/node/index.cjs.js', format: 'cjs', sourcemap: true }, ], // Indicate here external modules you don't wanna include in your bundle (i.e.: 'lodash') - external: ['@janhq/core/node'], + external: ['@janhq/core/node', 'cpu-instructions'], watch: { include: 'src/node/**', }, @@ -147,7 +150,10 @@ export default [ // Allow json resolution json(), // Compile TypeScript files - typescript({ useTsconfigDeclarationDir: true }), + typescript({ + useTsconfigDeclarationDir: true, + exclude: ['**/__tests__', '**/*.test.ts'], + }), // Allow bundling cjs modules (unlike webpack, rollup doesn't understand cjs) commonjs(), // Allow node_modules resolution, so you can use 'external' to control @@ -156,7 +162,6 @@ export default [ resolve({ extensions: ['.ts', '.js', '.json'], }), - // Resolve source maps to the original source sourceMaps(), ], diff --git a/extensions/inference-nitro-extension/src/index.ts b/extensions/inference-nitro-extension/src/index.ts index a027e88449..d79e076d4e 100644 --- a/extensions/inference-nitro-extension/src/index.ts +++ b/extensions/inference-nitro-extension/src/index.ts @@ -73,6 +73,7 @@ export default class JanInferenceNitroExtension extends LocalOAIEngine { this.registerModels(models) super.onLoad() + // Add additional dependencies PATH to the env executeOnMain(NODE, 'addAdditionalDependencies', { name: this.name, version: this.version, diff --git a/extensions/inference-nitro-extension/src/node/execute.test.ts b/extensions/inference-nitro-extension/src/node/execute.test.ts index cf9e84acf7..dfd8b35a96 100644 --- a/extensions/inference-nitro-extension/src/node/execute.test.ts +++ b/extensions/inference-nitro-extension/src/node/execute.test.ts @@ -1,7 +1,7 @@ import { describe, expect, it } from '@jest/globals' import { 
 import { executableNitroFile } from './execute'
 import { GpuSetting } from '@janhq/core'
-import { sep } from 'path'
+import { cpuInfo } from 'cpu-instructions'

 let testSettings: GpuSetting = {
   run_mode: 'cpu',
@@ -22,6 +22,14 @@
 }
 const originalPlatform = process.platform

+jest.mock('cpu-instructions', () => ({
+  cpuInfo: {
+    cpuInfo: jest.fn(),
+  },
+}))
+let mock = cpuInfo.cpuInfo as jest.Mock
+mock.mockReturnValue([])
+
 describe('test executable nitro file', () => {
   afterAll(function () {
     Object.defineProperty(process, 'platform', {
@@ -38,17 +46,19 @@
     })
     expect(executableNitroFile(testSettings)).toEqual(
       expect.objectContaining({
-        executablePath: expect.stringContaining(`mac-arm64${sep}cortex-cpp`),
+        enginePath: expect.stringContaining(`mac-arm64`),
+        executablePath: originalPlatform === 'darwin' ? expect.stringContaining(`mac-arm64/cortex-cpp`) : expect.anything(),
         cudaVisibleDevices: '',
         vkVisibleDevices: '',
       })
     )
     Object.defineProperty(process, 'arch', {
-      value: 'amd64',
+      value: 'x64',
     })
     expect(executableNitroFile(testSettings)).toEqual(
       expect.objectContaining({
-        executablePath: expect.stringContaining(`mac-amd64${sep}cortex-cpp`),
+        enginePath: expect.stringContaining(`mac-x64`),
+        executablePath: originalPlatform === 'darwin' ? expect.stringContaining(`mac-x64/cortex-cpp`) : expect.anything(),
         cudaVisibleDevices: '',
         vkVisibleDevices: '',
       })
     )
@@ -62,14 +72,11 @@
     const settings: GpuSetting = {
       ...testSettings,
       run_mode: 'cpu',
-      cuda: {
-        exist: true,
-        version: '11',
-      },
     }
     expect(executableNitroFile(settings)).toEqual(
       expect.objectContaining({
-        executablePath: expect.stringContaining(`win-cpu${sep}cortex-cpp.exe`),
+        enginePath: expect.stringContaining(`win`),
+        executablePath: expect.stringContaining(`cortex-cpp.exe`),
         cudaVisibleDevices: '',
         vkVisibleDevices: '',
       })
     )
@@ -102,7 +109,8 @@
     }
     expect(executableNitroFile(settings)).toEqual(
       expect.objectContaining({
-        executablePath: expect.stringContaining(`win-cuda-11-7${sep}cortex-cpp.exe`),
+        enginePath: expect.stringContaining(`win-cuda-11-7`),
+        executablePath: expect.stringContaining(`cortex-cpp.exe`),
         cudaVisibleDevices: '0',
         vkVisibleDevices: '0',
       })
     )
@@ -135,7 +143,8 @@
     }
     expect(executableNitroFile(settings)).toEqual(
       expect.objectContaining({
-        executablePath: expect.stringContaining(`win-cuda-12-0${sep}cortex-cpp.exe`),
+        enginePath: expect.stringContaining(`win-cuda-12-0`),
+        executablePath: expect.stringContaining(`cortex-cpp.exe`),
         cudaVisibleDevices: '0',
         vkVisibleDevices: '0',
       })
     )
@@ -152,7 +161,8 @@
     }
     expect(executableNitroFile(settings)).toEqual(
       expect.objectContaining({
-        executablePath: expect.stringContaining(`linux-cpu${sep}cortex-cpp`),
+        enginePath: expect.stringContaining(`linux`),
+        executablePath: expect.stringContaining(`cortex-cpp`),
         cudaVisibleDevices: '',
         vkVisibleDevices: '',
       })
     )
@@ -185,7 +195,8 @@
     }
     expect(executableNitroFile(settings)).toEqual(
       expect.objectContaining({
-        executablePath: expect.stringContaining(`linux-cuda-11-7${sep}cortex-cpp`),
+        enginePath: expect.stringContaining(`linux-cuda-11-7`),
+        executablePath: expect.stringContaining(`cortex-cpp`),
         cudaVisibleDevices: '0',
         vkVisibleDevices: '0',
       })
     )
@@ -218,10 +229,203 @@
     }
     expect(executableNitroFile(settings)).toEqual(
       expect.objectContaining({
-        executablePath: expect.stringContaining(`linux-cuda-12-0${sep}cortex-cpp`),
+        enginePath: expect.stringContaining(`linux-cuda-12-0`),
+        executablePath: expect.stringContaining(`cortex-cpp`),
         cudaVisibleDevices: '0',
         vkVisibleDevices: '0',
       })
     )
   })
+
+  // Generate test for different cpu instructions on Linux
+  it(`executes on Linux CPU with different instructions`, () => {
+    Object.defineProperty(process, 'platform', {
+      value: 'linux',
+    })
+    const settings: GpuSetting = {
+      ...testSettings,
+      run_mode: 'cpu',
+    }
+
+    const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
+    cpuInstructions.forEach((instruction) => {
+      mock.mockReturnValue([instruction])
+
+      expect(executableNitroFile(settings)).toEqual(
+        expect.objectContaining({
+          enginePath: expect.stringContaining(`linux-${instruction}`),
+          executablePath: expect.stringContaining(`cortex-cpp`),
+          cudaVisibleDevices: '',
+          vkVisibleDevices: '',
+        })
+      )
+    })
+  })
+  // Generate test for different cpu instructions on Windows
+  it(`executes on Windows CPU with different instructions`, () => {
+    Object.defineProperty(process, 'platform', {
+      value: 'win32',
+    })
+    const settings: GpuSetting = {
+      ...testSettings,
+      run_mode: 'cpu',
+    }
+    const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
+    cpuInstructions.forEach((instruction) => {
+      mock.mockReturnValue([instruction])
+      expect(executableNitroFile(settings)).toEqual(
+        expect.objectContaining({
+          enginePath: expect.stringContaining(`win-${instruction}`),
+          executablePath: expect.stringContaining(`cortex-cpp.exe`),
+          cudaVisibleDevices: '',
+          vkVisibleDevices: '',
+        })
+      )
+    })
+  })
+
+  // Generate test for different cpu instructions on Windows
+  it(`executes on Windows GPU with different instructions`, () => {
+    Object.defineProperty(process, 'platform', {
+      value: 'win32',
+    })
+    const settings: GpuSetting = {
+      ...testSettings,
+      run_mode: 'gpu',
+      cuda: {
+        exist: true,
+        version: '12',
+      },
+      nvidia_driver: {
+        exist: true,
+        version: '12',
+      },
+      gpus_in_use: ['0'],
+      gpus: [
+        {
+          id: '0',
+          name: 'NVIDIA GeForce GTX 1080',
+          vram: '80000000',
+        },
+      ],
+    }
+    const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
+    cpuInstructions.forEach((instruction) => {
+      mock.mockReturnValue([instruction])
+      expect(executableNitroFile(settings)).toEqual(
+        expect.objectContaining({
+          enginePath: expect.stringContaining(`win-cuda-12-0`),
+          executablePath: expect.stringContaining(`cortex-cpp.exe`),
+          cudaVisibleDevices: '0',
+          vkVisibleDevices: '0',
+        })
+      )
+    })
+  })
+
+  // Generate test for different cpu instructions on Linux
+  it(`executes on Linux GPU with different instructions`, () => {
+    Object.defineProperty(process, 'platform', {
+      value: 'linux',
+    })
+    const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
+    const settings: GpuSetting = {
+      ...testSettings,
+      run_mode: 'gpu',
+      cuda: {
+        exist: true,
+        version: '12',
+      },
+      nvidia_driver: {
+        exist: true,
+        version: '12',
+      },
+      gpus_in_use: ['0'],
+      gpus: [
+        {
+          id: '0',
+          name: 'NVIDIA GeForce GTX 1080',
+          vram: '80000000',
+        },
+      ],
+    }
+    cpuInstructions.forEach((instruction) => {
+      mock.mockReturnValue([instruction])
+      expect(executableNitroFile(settings)).toEqual(
+        expect.objectContaining({
+          enginePath: expect.stringContaining(`linux-cuda-12-0`),
+          executablePath: expect.stringContaining(`cortex-cpp`),
+          cudaVisibleDevices: '0',
+          vkVisibleDevices: '0',
+        })
+      )
+    })
+  })
+
+  // Vulkan on Linux should take precedence over the detected CPU instructions
+  it(`executes on Linux Vulkan should not have CPU instructions included`, () => {
+    Object.defineProperty(process, 'platform', {
+      value: 'linux',
+    })
+    const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
+    const settings: GpuSetting = {
+      ...testSettings,
+      run_mode: 'gpu',
+      vulkan: true,
+      cuda: {
+        exist: true,
+        version: '12',
+      },
+      nvidia_driver: {
+        exist: true,
+        version: '12',
+      },
+      gpus_in_use: ['0'],
+      gpus: [
+        {
+          id: '0',
+          name: 'NVIDIA GeForce GTX 1080',
+          vram: '80000000',
+        },
+      ],
+    }
+    cpuInstructions.forEach((instruction) => {
+      mock.mockReturnValue([instruction])
+      expect(executableNitroFile(settings)).toEqual(
+        expect.objectContaining({
+          enginePath: expect.stringContaining(`linux-vulkan`),
+          executablePath: expect.stringContaining(`cortex-cpp`),
+          cudaVisibleDevices: '0',
+          vkVisibleDevices: '0',
+        })
+      )
+    })
+  })
+
+  // CPU instructions should be ignored on MacOS
+  it(`executes on MacOS with different instructions`, () => {
+    Object.defineProperty(process, 'platform', {
+      value: 'darwin',
+    })
+    const cpuInstructions = ['avx512', 'avx2', 'avx', 'noavx']
+    cpuInstructions.forEach(() => {
+      Object.defineProperty(process, 'platform', {
+        value: 'darwin',
+      })
+      const settings: GpuSetting = {
+        ...testSettings,
+        run_mode: 'cpu',
+      }
+      mock.mockReturnValue([])
+      expect(executableNitroFile(settings)).toEqual(
+        expect.objectContaining({
+          enginePath: expect.stringContaining(`mac-x64`),
+          executablePath: originalPlatform === 'darwin' ? expect.stringContaining(`mac-x64/cortex-cpp`) : expect.anything(),
+          cudaVisibleDevices: '',
+          vkVisibleDevices: '',
+        })
+      )
+    })
+  })
 })
diff --git a/extensions/inference-nitro-extension/src/node/execute.ts b/extensions/inference-nitro-extension/src/node/execute.ts
index 417734afa7..595063ed48 100644
--- a/extensions/inference-nitro-extension/src/node/execute.ts
+++ b/extensions/inference-nitro-extension/src/node/execute.ts
@@ -1,37 +1,59 @@
 import { GpuSetting } from '@janhq/core'
 import * as path from 'path'
+import { cpuInfo } from 'cpu-instructions'

 export interface NitroExecutableOptions {
+  enginePath: string
   executablePath: string
   cudaVisibleDevices: string
   vkVisibleDevices: string
 }

-const runMode = (settings?: GpuSetting): string => {
+/**
+ * The GPU run mode that will be set - either 'vulkan', 'cuda', or empty for CPU.
+ * @param settings
+ * @returns
+ */
+const gpuRunMode = (settings?: GpuSetting): string => {
   if (process.platform === 'darwin')
     // MacOS now has universal binaries
     return ''

-  if (!settings) return 'cpu'
+  if (!settings) return ''

   return settings.vulkan === true
     ? 'vulkan'
     : settings.run_mode === 'cpu'
-      ? 'cpu'
+      ? ''
       : 'cuda'
 }

+/**
+ * The OS & architecture that the current process is running on.
+ * @returns win, mac-x64, mac-arm64, or linux
+ */
 const os = (): string => {
   return process.platform === 'win32'
     ? 'win'
     : process.platform === 'darwin'
-      ? process.arch === 'arm64' ? 'mac-arm64' : 'mac-amd64'
+      ? process.arch === 'arm64'
+        ? 'mac-arm64'
+        : 'mac-x64'
       : 'linux'
 }

+/**
+ * The executable file extension for the current platform.
+ * @returns .exe if on Windows, otherwise an empty string.
+ */
 const extension = (): '.exe' | '' => {
   return process.platform === 'win32' ? '.exe' : ''
 }

+/**
+ * The CUDA version that will be set - either '11-7' or '12-0'.
+ * @param settings
+ * @returns
+ */
 const cudaVersion = (settings?: GpuSetting): '11-7' | '12-0' | undefined => {
   const isUsingCuda =
     settings?.vulkan !== true && settings?.run_mode === 'gpu' && os() !== 'mac'

@@ -40,6 +62,21 @@ const cudaVersion = (settings?: GpuSetting): '11-7' | '12-0' | undefined => {
   return settings?.cuda?.version === '11' ? '11-7' : '12-0'
 }

+/**
+ * The CPU instructions that will be set - either 'avx512', 'avx2', 'avx', or 'noavx'.
+ * @returns
+ */
+const cpuInstructions = () => {
+  if (process.platform === 'darwin') return ''
+  return cpuInfo.cpuInfo().some((e) => e.toUpperCase() === 'AVX512')
+    ? 'avx512'
+    : cpuInfo.cpuInfo().some((e) => e.toUpperCase() === 'AVX2')
+      ? 'avx2'
+      : cpuInfo.cpuInfo().some((e) => e.toUpperCase() === 'AVX')
+        ? 'avx'
+        : 'noavx'
+}
+
 /**
  * Find which executable file to run based on the current platform.
  * @returns The name of the executable file to run.
@@ -47,15 +84,26 @@ const cudaVersion = (settings?: GpuSetting): '11-7' | '12-0' | undefined => {
 export const executableNitroFile = (
   gpuSetting?: GpuSetting
 ): NitroExecutableOptions => {
-  let binaryFolder = [os(), runMode(gpuSetting), cudaVersion(gpuSetting)]
+  let engineFolder = [
+    os(),
+    ...(gpuSetting?.vulkan
+      ? []
+      : [
+          gpuRunMode(gpuSetting) !== 'cuda' ? cpuInstructions() : '',
+          gpuRunMode(gpuSetting),
+          cudaVersion(gpuSetting),
+        ]),
+    gpuSetting?.vulkan ? 'vulkan' : undefined,
+  ]
     .filter((e) => !!e)
     .join('-')

   let cudaVisibleDevices = gpuSetting?.gpus_in_use.join(',') ?? ''
   let vkVisibleDevices = gpuSetting?.gpus_in_use.join(',') ?? ''
-  let binaryName = `cortex-cpp${extension()}`
+  let binaryName = `${process.platform === 'darwin' ? `${os()}/` : ''}cortex-cpp${extension()}`
   return {
-    executablePath: path.join(__dirname, '..', 'bin', binaryFolder, binaryName),
+    enginePath: path.join(__dirname, '..', 'bin', engineFolder),
+    executablePath: path.join(__dirname, '..', 'bin', binaryName),
     cudaVisibleDevices,
     vkVisibleDevices,
   }
diff --git a/extensions/inference-nitro-extension/src/node/index.test.ts b/extensions/inference-nitro-extension/src/node/index.test.ts
new file mode 100644
index 0000000000..6e64b4a060
--- /dev/null
+++ b/extensions/inference-nitro-extension/src/node/index.test.ts
@@ -0,0 +1,465 @@
+jest.mock('fetch-retry', () => ({
+  default: () => () => {
+    return Promise.resolve({
+      ok: true,
+      status: 200,
+      json: () =>
+        Promise.resolve({
+          model_loaded: true,
+        }),
+      text: () => Promise.resolve(''),
+    })
+  },
+}))
+
+jest.mock('path', () => ({
+  default: {
+    isAbsolute: jest.fn(),
+    join: jest.fn(),
+    parse: () => {
+      return { dir: 'dir' }
+    },
+    delimiter: { concat: () => '' },
+  },
+}))
+
+jest.mock('decompress', () => ({
+  default: () => {
+    return Promise.resolve()
+  },
+}))
+
+jest.mock('@janhq/core/node', () => ({
+  ...jest.requireActual('@janhq/core/node'),
+  getJanDataFolderPath: () => '',
+  getSystemResourceInfo: () => {
+    return {
+      cpu: {
+        cores: 1,
+        logicalCores: 1,
+        threads: 1,
+        model: 'model',
+        speed: 1,
+      },
+      memory: {
+        total: 1,
+        free: 1,
+      },
+      gpu: {
+        model: 'model',
+        memory: 1,
+        cuda: {
+          version: 'version',
+          devices: 'devices',
+        },
+        vulkan: {
+          version: 'version',
+          devices: 'devices',
+        },
+      },
+    }
+  },
+}))
+
+jest.mock('fs', () => ({
+  default: {
+    readdirSync: () => [],
+  },
+}))
+
+jest.mock('child_process', () => ({
+  exec: () => {
+    return {
+      stdout: { on: jest.fn() },
+      stderr: { on: jest.fn() },
+      on: jest.fn(),
+    }
+  },
+  spawn: () => {
+    return {
+      stdout: { on: jest.fn() },
+      stderr: { on: jest.fn() },
+      on: jest.fn(),
+      pid: '111',
+    }
+  },
+}))
+
+jest.mock('tcp-port-used', () => ({
+  default: {
+    waitUntilFree: () => Promise.resolve(true),
+    waitUntilUsed: () => Promise.resolve(true),
+  },
+}))
+
+jest.mock('./execute', () => ({
+  executableNitroFile: () => {
+    return {
+      enginePath: 'enginePath',
+      executablePath: 'executablePath',
+      cudaVisibleDevices: 'cudaVisibleDevices',
+      vkVisibleDevices: 'vkVisibleDevices',
+    }
+  },
+}))
+
+jest.mock('terminate', () => ({
+  default: (id: String, func: Function) => {
+    console.log(id)
+    func()
+  },
+}))
+
+import * as execute from './execute'
+import index from './index'
+
+let executeMock = execute
+
+const modelInitOptions: any = {
+  modelFolder: '/path/to/model',
+  model: {
+    id: 'test',
+    name: 'test',
+    engine: 'nitro',
+    version: '0.0',
+    format: 'GGUF',
+    object: 'model',
+    sources: [],
+    created: 0,
+    description: 'test',
+    parameters: {},
+    metadata: {
+      author: '',
+      tags: [],
+      size: 0,
+    },
+    settings: {
+      prompt_template: '{prompt}',
+      llama_model_path: 'model.gguf',
+    },
+  },
+}
+
+describe('loadModel', () => {
+  it('should load a model successfully', async () => {
+    // Mock the necessary parameters and system information
+
+    const systemInfo = {
+      // Mock the system information if needed
+    }
+
+    // Call the loadModel function
+    const result = await index.loadModel(modelInitOptions, systemInfo)
+
+    // Assert that the result is as expected
+    expect(result).toBeUndefined()
+  })
+
+  it('should reject with an error message if the model is not a nitro model', async () => {
+    // Mock the necessary parameters and system information
+
+    const systemInfo = {
+      // Mock the system information if needed
+    }
+    modelInitOptions.model.engine = 'not-nitro'
+    // Call the loadModel function
+    try {
+      await index.loadModel(modelInitOptions, systemInfo)
+    } catch (error) {
+      // Assert that the error message is as expected
+      expect(error).toBe('Not a cortex model')
+    }
+    modelInitOptions.model.engine = 'nitro'
+  })
+
+  it('should reject if model load failed with an error message', async () => {
+    // Mock the necessary parameters and system information
+
+    const systemInfo = {
+      // Mock the system information if needed
+    }
+    // Mock the fetch-retry module to return a failed response
+    jest.mock('fetch-retry', () => ({
+      default: () => () => {
+        return Promise.resolve({
+          ok: false,
+          status: 500,
+          json: () =>
+            Promise.resolve({
+              model_loaded: false,
+            }),
+          text: () => Promise.resolve('Failed to load model'),
+        })
+      },
+    }))
+
+    // Call the loadModel function
+    try {
+      await index.loadModel(modelInitOptions, systemInfo)
+    } catch (error) {
+      // Assert that the error message is as expected
+      expect(error).toBe('Failed to load model')
+    }
+  })
+
+  it('should reject if port not available', async () => {
+    // Mock the necessary parameters and system information
+
+    const systemInfo = {
+      // Mock the system information if needed
+    }
+
+    // Mock the tcp-port-used module to return false
+    jest.mock('tcp-port-used', () => ({
+      default: {
+        waitUntilFree: () => Promise.resolve(false),
+        waitUntilUsed: () => Promise.resolve(false),
+      },
+    }))
+
+    // Call the loadModel function
+    try {
+      await index.loadModel(modelInitOptions, systemInfo)
+    } catch (error) {
+      // Assert that the error message is as expected
+      expect(error).toBe('Port not available')
+    }
+  })
+
+  it('should run on GPU model if ngl is set', async () => {
+    const systemInfo: any = {
+      gpuSetting: {
+        run_mode: 'gpu',
+      },
+    }
+    // Spy executableNitroFile
+    jest.spyOn(executeMock, 'executableNitroFile').mockReturnValue({
+      enginePath: '',
+      executablePath: '',
+      cudaVisibleDevices: '',
+      vkVisibleDevices: '',
+    })
+
+    Object.defineProperty(process, 'platform', { value: 'win32' })
+    await index.loadModel(
+      {
+        ...modelInitOptions,
+        model: {
+          ...modelInitOptions.model,
+          settings: {
+            ...modelInitOptions.model.settings,
+            ngl: 40,
+          },
+        },
+      },
+      systemInfo
+    )
+    expect(executeMock.executableNitroFile).toHaveBeenCalledWith({
+      run_mode: 'gpu',
+    })
+  })
+
+  it('should run on correct CPU instructions if ngl is not set', async () => {
+    const systemInfo: any = {
+      gpuSetting: {
+        run_mode: 'gpu',
+      },
+    }
+    // Spy executableNitroFile
+    jest.spyOn(executeMock, 'executableNitroFile').mockReturnValue({
+      enginePath: '',
+      executablePath: '',
+      cudaVisibleDevices: '',
+      vkVisibleDevices: '',
+    })
+
+    Object.defineProperty(process, 'platform', { value: 'win32' })
+    await index.loadModel(
+      {
+        ...modelInitOptions,
+        model: {
+          ...modelInitOptions.model,
+          settings: {
+            ...modelInitOptions.model.settings,
+            ngl: undefined,
+          },
+        },
+      },
+      systemInfo
+    )
+    expect(executeMock.executableNitroFile).toHaveBeenCalledWith({
+      run_mode: 'cpu',
+    })
+  })
+
+  it('should run on correct CPU instructions if ngl is 0', async () => {
+    const systemInfo: any = {
+      gpuSetting: {
+        run_mode: 'gpu',
+      },
+    }
+    // Spy executableNitroFile
+    jest.spyOn(executeMock, 'executableNitroFile').mockReturnValue({
+      enginePath: '',
+      executablePath: '',
+      cudaVisibleDevices: '',
+      vkVisibleDevices: '',
+    })
+
+    Object.defineProperty(process, 'platform', { value: 'win32' })
+    await index.loadModel(
+      {
+        ...modelInitOptions,
+        model: {
+          ...modelInitOptions.model,
+          settings: {
+            ...modelInitOptions.model.settings,
+            ngl: 0,
+          },
+        },
+      },
+      systemInfo
+    )
+    expect(executeMock.executableNitroFile).toHaveBeenCalledWith({
+      run_mode: 'cpu',
+    })
+  })
+})
+
+describe('unloadModel', () => {
+  it('should unload a model successfully', async () => {
+    // Call the unloadModel function
+    const result = await index.unloadModel()
+
+    // Assert that the result is as expected
+    expect(result).toBeUndefined()
+  })
+
+  it('should reject with an error message if the model is not a nitro model', async () => {
+    // Call the unloadModel function
+    try {
+      await index.unloadModel()
+    } catch (error) {
+      // Assert that the error message is as expected
+      expect(error).toBe('Not a cortex model')
+    }
+  })
+
+  it('should reject if model unload failed with an error message', async () => {
+    // Mock the fetch-retry module to return a failed response
+    jest.mock('fetch-retry', () => ({
+      default: () => () => {
+        return Promise.resolve({
+          ok: false,
+          status: 500,
+          json: () =>
+            Promise.resolve({
+              model_unloaded: false,
+            }),
+          text: () => Promise.resolve('Failed to unload model'),
+        })
+      },
+    }))
+
+    // Call the unloadModel function
+    try {
+      await index.unloadModel()
+    } catch (error) {
+      // Assert that the error message is as expected
+      expect(error).toBe('Failed to unload model')
+    }
+  })
+
+  it('should reject if port not available', async () => {
+    // Mock the tcp-port-used module to return false
+    jest.mock('tcp-port-used', () => ({
+      default: {
+        waitUntilFree: () => Promise.resolve(false),
+        waitUntilUsed: () => Promise.resolve(false),
+      },
+    }))
+
+    // Call the unloadModel function
+    try {
+      await index.unloadModel()
+    } catch (error) {
+      // Assert that the error message is as expected
+      expect(error).toBe('Port not available')
+    }
+  })
+})
+describe('dispose', () => {
+  it('should dispose a model successfully on Mac', async () => {
+    Object.defineProperty(process, 'platform', {
+      value: 'darwin',
+    })
+
+    // Call the dispose function
+    const result = await index.dispose()
+
+    // Assert that the result is as expected
+    expect(result).toBeUndefined()
+  })
+
+  it('should kill the subprocess successfully on Windows', async () => {
+    Object.defineProperty(process, 'platform', {
+      value: 'win32',
+    })
+
+    // Call the killSubprocess function
+    const result = await index.dispose()
+
+    // Assert that the result is as expected
+    expect(result).toBeUndefined()
+  })
+})
+
+describe('getCurrentNitroProcessInfo', () => {
+  it('should return the current nitro process info', async () => {
+    // Call the getCurrentNitroProcessInfo function
+    const result = await index.getCurrentNitroProcessInfo()
+
+    // Assert that the result is as expected
+    expect(result).toEqual({
+      isRunning: true,
+    })
+  })
+})
+
+describe('decompressRunner', () => {
+  it('should decompress the runner successfully', async () => {
+    jest.mock('decompress', () => ({
+      default: () => {
+        return Promise.resolve()
+      },
+    }))
+    // Call the decompressRunner function
+    const result = await index.decompressRunner('', '')
+
+    // Assert that the result is as expected
+    expect(result).toBeUndefined()
+  })
+  it('should not reject if decompression failed', async () => {
+    jest.mock('decompress', () => ({
+      default: () => {
+        return Promise.reject('Failed to decompress')
+      },
+    }))
+    // Call the decompressRunner function
+    const result = await index.decompressRunner('', '')
+    expect(result).toBeUndefined()
+  })
+})
+
+describe('addAdditionalDependencies', () => {
+  it('should add additional dependencies successfully', async () => {
+    // Call the addAdditionalDependencies function
+    const result = await index.addAdditionalDependencies({
+      name: 'name',
+      version: 'version',
+    })
+
+    // Assert that the result is as expected
+    expect(result).toBeUndefined()
+  })
+})
diff --git a/extensions/inference-nitro-extension/src/node/index.ts b/extensions/inference-nitro-extension/src/node/index.ts
index 77ac9af7a0..edc2d013de 100644
--- a/extensions/inference-nitro-extension/src/node/index.ts
+++ b/extensions/inference-nitro-extension/src/node/index.ts
@@ -263,10 +263,10 @@ async function validateModelStatus(modelId: string): Promise<void> {
   log(`[CORTEX]::Debug: Validating model ${modelId}`)
   return fetchRetry(NITRO_HTTP_VALIDATE_MODEL_URL, {
     method: 'POST',
-    body: JSON.stringify({ 
+    body: JSON.stringify({
       model: modelId,
       // TODO: force to use cortex llamacpp by default
-      engine: 'cortex.llamacpp'
+      engine: 'cortex.llamacpp',
     }),
     headers: {
       'Content-Type': 'application/json',
@@ -365,14 +365,37 @@ function spawnNitroProcess(systemInfo?: SystemInformation): Promise<any> {
   log(`[CORTEX]::Debug: Spawning cortex subprocess...`)

   return new Promise(async (resolve, reject) => {
-    let executableOptions = executableNitroFile(systemInfo?.gpuSetting)
+    let executableOptions = executableNitroFile(
+      // If ngl is not set or equal to 0, run on CPU with correct instructions
+      systemInfo?.gpuSetting
+        ? {
+            ...systemInfo.gpuSetting,
+            run_mode:
+              currentSettings?.ngl === undefined || currentSettings.ngl === 0
+                ? 'cpu'
+                : systemInfo.gpuSetting.run_mode,
+          }
+        : undefined
+    )
     const args: string[] = ['1', LOCAL_HOST, PORT.toString()]
     // Execute the binary
     log(
       `[CORTEX]::Debug: Spawn cortex at path: ${executableOptions.executablePath}, and args: ${args}`
     )
-    log(path.parse(executableOptions.executablePath).dir)
+    log(`[CORTEX]::Debug: Cortex engine path: ${executableOptions.enginePath}`)
+
+    // Add engine path to the PATH and LD_LIBRARY_PATH
+    process.env.PATH = (process.env.PATH || '').concat(
+      path.delimiter,
+      executableOptions.enginePath
+    )
+    log(`[CORTEX] PATH: ${process.env.PATH}`)
+    process.env.LD_LIBRARY_PATH = (process.env.LD_LIBRARY_PATH || '').concat(
+      path.delimiter,
+      executableOptions.enginePath
+    )
+
     subprocess = spawn(
       executableOptions.executablePath,
       ['1', LOCAL_HOST, PORT.toString()],
@@ -380,6 +403,7 @@ function spawnNitroProcess(systemInfo?: SystemInformation): Promise<any> {
         cwd: path.join(path.parse(executableOptions.executablePath).dir),
         env: {
           ...process.env,
+          ENGINE_PATH: executableOptions.enginePath,
           CUDA_VISIBLE_DEVICES: executableOptions.cudaVisibleDevices,
           // Vulkan - Support 1 device at a time for now
           ...(executableOptions.vkVisibleDevices?.length > 0 && {
@@ -440,12 +464,19 @@ const getCurrentNitroProcessInfo = (): NitroProcessInfo => {
 }

 const addAdditionalDependencies = (data: { name: string; version: string }) => {
+  log(
+    `[CORTEX]::Debug: Adding additional dependencies for ${data.name} ${data.version}`
+  )
   const additionalPath = path.delimiter.concat(
     path.join(getJanDataFolderPath(), 'engines', data.name, data.version)
   )
   // Set the updated PATH
-  process.env.PATH = (process.env.PATH || '').concat(additionalPath)
+  process.env.PATH = (process.env.PATH || '').concat(
+    path.delimiter,
+    additionalPath
+  )
   process.env.LD_LIBRARY_PATH = (process.env.LD_LIBRARY_PATH || '').concat(
+    path.delimiter,
     additionalPath
   )
 }
diff --git a/extensions/inference-nitro-extension/tsconfig.json b/extensions/inference-nitro-extension/tsconfig.json
index bada43fc7b..19d8572b51 100644
--- a/extensions/inference-nitro-extension/tsconfig.json
+++ b/extensions/inference-nitro-extension/tsconfig.json
@@ -15,5 +15,6 @@
     "importHelpers": true,
     "typeRoots": ["node_modules/@types"]
   },
-  "include": ["src"]
+  "include": ["src"],
+  "exclude": ["src/**/*.test.ts"]
 }
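
---

Reviewer note: for clarity, here is a minimal TypeScript sketch of the CPU-instruction-based engine selection this patch introduces. The `cpuInfo.cpuInfo()` call and the avx512 > avx2 > avx > noavx precedence are taken from `execute.ts` above; the `instructionSuffix` helper name and the sample folder names in comments are illustrative only, not part of the patch.

    import { cpuInfo } from 'cpu-instructions'

    // Illustrative helper: collapse the detected instruction-set flags into the
    // single suffix used in the engine folder name, preferring the widest set.
    const instructionSuffix = (): string => {
      const flags = cpuInfo.cpuInfo().map((flag) => flag.toUpperCase())
      if (flags.includes('AVX512')) return 'avx512'
      if (flags.includes('AVX2')) return 'avx2'
      if (flags.includes('AVX')) return 'avx'
      return 'noavx'
    }

    // Engine folder names the selection scheme then produces, per the tests above:
    //   linux, CPU only   -> linux-avx2        (on an AVX2-capable host)
    //   linux, CUDA 12    -> linux-cuda-12-0   (CPU suffix skipped when CUDA is used)
    //   win,   Vulkan     -> win-vulkan        (CPU suffix skipped when Vulkan is used)
    //   darwin            -> mac-arm64 / mac-x64 (no suffix; universal engines per arch)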