Merge pull request #15 from Josh-XT/contextmgr
Add context manager and logging
Josh-XT authored Jan 27, 2024
2 parents 9836da9 + 77a1f8b commit a4ae87e
Showing 14 changed files with 155 additions and 142 deletions.
1 change: 1 addition & 0 deletions .env
@@ -2,3 +2,4 @@ GPU_LAYERS=0
MAIN_GPU=0
LOCAL_LLM_API_KEY=
DEFAULT_MODEL=phi-2-dpo
WHISPER_MODEL=base.en
2 changes: 1 addition & 1 deletion .github/workflows/publish-docker-dev.yml
@@ -58,7 +58,7 @@ jobs:
- name: Get full image path
id: get_image_path
run: |
echo "IMAGE_PATH=$(echo ghcr.io/${{ env.GITHUB_USER }}/${{ env.REPO_NAME }}:${{ matrix.tag_name }}-${{ env.BRANCH_NAME }}-${{ github.sha }})" >> $GITHUB_ENV
echo "IMAGE_PATH=$(echo ghcr.io/${{ env.GITHUB_USER }}/${{ env.REPO_NAME }}:cpu-dev-${{ env.BRANCH_NAME }}-${{ github.sha }})" >> $GITHUB_ENV
test-local-llm:
uses: josh-xt/AGiXT/.github/workflows/operation-test-with-jupyter.yml@main
10 changes: 6 additions & 4 deletions README.md
@@ -2,7 +2,7 @@

[![GitHub](https://img.shields.io/badge/GitHub-Local%20LLM-blue?logo=github&style=plastic)](https://github.com/Josh-XT/Local-LLM) [![Dockerhub](https://img.shields.io/badge/Docker-Local%20LLM-blue?logo=docker&style=plastic)](https://hub.docker.com/r/joshxt/local-llm)

Local-LLM is a simple [llama.cpp](https://github.com/ggerganov/llama.cpp) server that easily exposes a list of local language models to choose from to run on your own computer. It is designed to be as easy as possible to get started with running local models. It automatically handles downloading the model of your choice and configuring the server based on your CPU, RAM, and GPU. It also includes [OpenAI Style](https://pypi.org/project/openai/) endpoints for easy integration with other applications.
Local-LLM is a simple [llama.cpp](https://github.com/ggerganov/llama.cpp) server that easily exposes a list of local language models to choose from to run on your own computer. It is designed to be as easy as possible to get started with running local models. It automatically handles downloading the model of your choice and configuring the server based on your CPU, RAM, and GPU. It also includes [OpenAI Style](https://pypi.org/project/openai/) endpoints for easy integration with other applications. Additional functionality is built in for voice-cloning text-to-speech and voice-to-text, enabling easy voice communication entirely offline after the initial setup.
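
As a rough illustration of those OpenAI-style endpoints, a minimal client sketch (not part of this commit) might look like the following. Port 8091 comes from the compose files in this repository; the `/v1` base path and the chat-completions call shape are assumptions carried over from the upstream OpenAI Python client.

```python
# Minimal sketch, not part of this commit. Assumes the server listens on
# localhost:8091 (as in docker-compose.yml) and exposes an OpenAI-compatible
# /v1 path; set api_key to your LOCAL_LLM_API_KEY if one is configured.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8091/v1", api_key="none")
response = client.chat.completions.create(
    model="phi-2-dpo",
    messages=[{"role": "user", "content": "Summarize what Local-LLM does."}],
)
print(response.choices[0].message.content)
```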

## Prerequisites

@@ -26,7 +26,9 @@ git clone https://github.com/Josh-XT/Local-LLM
cd Local-LLM
```

Expand Environment Setup if you would like to modify the default environment variables, otherwise skip to Usage.
### Environment Setup

Expand Environment Setup if you would like to modify the default environment variables; otherwise, skip to Usage. All environment variables are optional and have useful defaults. Set `DEFAULT_MODEL` in your `.env` file to change the default model that Local-LLM starts with.

<details>
<summary>Environment Setup (Optional)</summary>
@@ -39,10 +41,10 @@ Replace the environment variables with your desired settings. Assumptions will b

- `LOCAL_LLM_API_KEY` - The API key to use for the server. If not set, the server will not require an API key when accepting requests.
- `DEFAULT_MODEL` - The default model to use when no model is specified. Default is `phi-2-dpo`.
- `MULTI_SERVER` - This will run two servers, one with `zephyr-7b-beta` running on GPU, and one with `phi-2-dpo` running on CPU. If set, this will run both, otherwise it will only run one server.
- `WHISPER_MODEL` - The model to use for speech-to-text. Default is `base.en`.
- `AUTO_UPDATE` - Whether or not to automatically update Local-LLM. Default is `true`.
- `THREADS` - The number of CPU threads Local-LLM is allowed to use. Default is `your CPU thread count minus 2`.
- `GPU_LAYERS` (Only applicable to NVIDIA GPU) - The number of layers to use on the GPU. Default is `0`.
- `GPU_LAYERS` (Only applicable to NVIDIA GPU) - The number of layers to use on the GPU. Default is `0`. Local-LLM will automatically determine the optimal number of layers to use based on your GPU's memory if it is set to 0 and you have an NVIDIA GPU.
- `MAIN_GPU` (Only applicable to NVIDIA GPU) - The GPU to use for the main model. Default is `0`.

</details>
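
For reference, a complete `.env` built from the variables documented above might look like the sketch below. Every value shown is the documented default except `THREADS`, which is an illustrative number for a 16-thread CPU (the actual default is your CPU thread count minus 2).

```env
LOCAL_LLM_API_KEY=
DEFAULT_MODEL=phi-2-dpo
WHISPER_MODEL=base.en
AUTO_UPDATE=true
THREADS=14
GPU_LAYERS=0
MAIN_GPU=0
```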
34 changes: 23 additions & 11 deletions app.py
@@ -7,9 +7,32 @@
from local_llm.STT import STT
from local_llm.CTTS import CTTS
import os
import logging
from dotenv import load_dotenv

load_dotenv()
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "phi-2-dpo")
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base.en")

CURRENT_MODEL = DEFAULT_MODEL if DEFAULT_MODEL else "phi-2-dpo"
CURRENT_STT_MODEL = WHISPER_MODEL if WHISPER_MODEL else "base.en"
logging.basicConfig(
level=os.environ.get("LOGLEVEL", "INFO"),
format="%(asctime)s | %(levelname)s | %(message)s",
)

logging.info(f"[CTTS] xttsv2_2.0.2 model loading. Please wait...")
LOADED_CTTS = CTTS()
logging.info(f"[CTTS] xttsv2_2.0.2 model loaded successfully.")

logging.info(f"[STT] {CURRENT_STT_MODEL} model loading. Please wait...")
LOADED_STT = STT(model=CURRENT_STT_MODEL)
logging.info(f"[STT] {CURRENT_STT_MODEL} model loaded successfully.")

logging.info(f"[LLM] {CURRENT_MODEL} model loading. Please wait...")
LOADED_LLM = LLM(model=CURRENT_MODEL)
logging.info(f"[LLM] {CURRENT_MODEL} model loaded successfully.")
logging.info(f"[Local-LLM] Server is ready.")


app = FastAPI(title="Local-LLM Server", docs_url="/")
@@ -20,17 +43,6 @@
allow_methods=["*"],
allow_headers=["*"],
)
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "phi-2-dpo")
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base.en")

CURRENT_MODEL = DEFAULT_MODEL if DEFAULT_MODEL else "phi-2-dpo"
CURRENT_STT_MODEL = WHISPER_MODEL if WHISPER_MODEL else "base.en"
print(f"[LLM] {CURRENT_MODEL} model loading...")
LOADED_LLM = LLM(model=CURRENT_MODEL)
print(f"[STT] {WHISPER_MODEL} model loading...")
LOADED_STT = STT(model=WHISPER_MODEL)
print(f"[CTTS] xttsv2_2.0.2 model loading...")
LOADED_CTTS = CTTS()


def verify_api_key(authorization: str = Header(None)):
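A note on the logging switch above: with `logging.basicConfig` using the `%(asctime)s | %(levelname)s | %(message)s` format and the `LOGLEVEL` environment variable (default `INFO`), startup output should look roughly like the lines below (timestamps are illustrative).

```
2024-01-27 12:00:00,101 | INFO | [CTTS] xttsv2_2.0.2 model loading. Please wait...
2024-01-27 12:00:14,532 | INFO | [CTTS] xttsv2_2.0.2 model loaded successfully.
2024-01-27 12:00:14,533 | INFO | [STT] base.en model loading. Please wait...
2024-01-27 12:00:16,040 | INFO | [STT] base.en model loaded successfully.
2024-01-27 12:00:16,041 | INFO | [LLM] phi-2-dpo model loading. Please wait...
2024-01-27 12:00:58,277 | INFO | [LLM] phi-2-dpo model loaded successfully.
2024-01-27 12:00:58,278 | INFO | [Local-LLM] Server is ready.
```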
1 change: 1 addition & 0 deletions docker-compose-cuda.yml
@@ -8,6 +8,7 @@ services:
- GPU_LAYERS=${GPU_LAYERS-0}
- MAIN_GPU=${MAIN_GPU-0}
- DEFAULT_MODEL=${DEFAULT_MODEL-phi-2-dpo}
- WHISPER_MODEL=${WHISPER_MODEL-base.en}
- CMAKE_ARGS="-DLLAMA_CUBLAS=on"
- LLAMA_CUBLAS=1
- CUDA_DOCKER_ARCH=all
17 changes: 15 additions & 2 deletions docker-compose-dev.yml
@@ -2,14 +2,27 @@ version: '3.8'

services:
local-llm:
image: ghcr.io/josh-xt/local-llm:cpu-dev-dev
image: ghcr.io/josh-xt/local-llm:cpu-dev
environment:
- LOCAL_LLM_API_KEY=${LOCAL_LLM_API_KEY-}
- GPU_LAYERS=0
- GPU_LAYERS=${GPU_LAYERS-0}
- MAIN_GPU=${MAIN_GPU-0}
- DEFAULT_MODEL=${DEFAULT_MODEL-phi-2-dpo}
- WHISPER_MODEL=${WHISPER_MODEL-base.en}
- CMAKE_ARGS="-DLLAMA_CUBLAS=on"
- LLAMA_CUBLAS=1
- CUDA_DOCKER_ARCH=all
restart: unless-stopped
ports:
- "8091:8091"
volumes:
- ./models:/app/models
- ./outputs:/app/outputs
- ./voices:/app/voices
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [ gpu ]
36 changes: 0 additions & 36 deletions docker-compose-multi.yml

This file was deleted.

1 change: 1 addition & 0 deletions docker-compose.yml
@@ -6,6 +6,7 @@ services:
environment:
- LOCAL_LLM_API_KEY=${LOCAL_LLM_API_KEY-}
- DEFAULT_MODEL=${DEFAULT_MODEL-phi-2-dpo}
- WHISPER_MODEL=${WHISPER_MODEL-base.en}
restart: unless-stopped
ports:
- "8091:8091"
3 changes: 2 additions & 1 deletion local_llm/CTTS.py
@@ -5,6 +5,7 @@
import torch
import torchaudio
import requests
import logging
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

@@ -31,7 +32,7 @@ def download_xtts():
for filename, url in files_to_download.items():
destination = os.path.join(os.getcwd(), "xttsv2_2.0.2", filename)
if not os.path.exists(destination):
print(f"[CTTS] Downloading {filename} for XTTSv2...")
logging.info(f"[CTTS] Downloading {filename} for XTTSv2...")
response = requests.get(url, stream=True)
block_size = 1024 # 1 Kibibyte
with open(destination, "wb") as file:
71 changes: 43 additions & 28 deletions local_llm/LLM.py
@@ -8,23 +8,10 @@
import json
import psutil
import torch
import logging


GPU_LAYERS = os.environ.get("GPU_LAYERS", 0)
if torch.cuda.is_available() and int(GPU_LAYERS) == 0:
VRAM = round(torch.cuda.get_device_properties(0).total_memory / 1024**3)
print(f"[LLM] {VRAM} GB of VRAM detected.")
GPU_LAYERS = min(2 * max(0, (VRAM - 1) // 2), 36)
RAM = round(psutil.virtual_memory().total / 1024**3)
MAIN_GPU = os.environ.get("MAIN_GPU", 0)
THREADS = os.environ.get("THREADS", psutil.cpu_count() - 2)
DOWNLOAD_MODELS = (
True if os.environ.get("DOWNLOAD_MODELS", "true").lower() == "true" else False
)
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "phi-2-dpo")
print(
f"[LLM] Running {DEFAULT_MODEL} with {GPU_LAYERS} GPU layers and {THREADS} CPU threads available for offloading."
)


def get_models():
@@ -49,7 +36,10 @@ def get_models():
return model_names


def get_model_url(model_name=DEFAULT_MODEL):
def get_model_url(model_name=""):
if model_name == "":
global DEFAULT_MODEL
model_name = DEFAULT_MODEL
model_url = ""
try:
models = get_models()
@@ -78,7 +68,10 @@ def get_model_name(model_url="TheBloke/phi-2-dpo-GGUF"):
return model_name


def get_readme(model_name=DEFAULT_MODEL, models_dir="models"):
def get_readme(model_name="", models_dir="models"):
if model_name == "":
global DEFAULT_MODEL
model_name = DEFAULT_MODEL
model_url = get_model_url(model_name=model_name)
model_name = model_name.lower()
if not os.path.exists(f"{models_dir}/{model_name}/README.md"):
@@ -92,7 +85,10 @@ def get_readme(model_name=DEFAULT_MODEL, models_dir="models"):
return readme


def get_max_tokens(model_name=DEFAULT_MODEL, models_dir="models"):
def get_max_tokens(model_name="", models_dir="models"):
if model_name == "":
global DEFAULT_MODEL
model_name = DEFAULT_MODEL
readme = get_readme(model_name=model_name, models_dir=models_dir)
if "200k" in readme:
return 200000
@@ -113,7 +109,10 @@ def get_max_tokens(model_name=DEFAULT_MODEL, models_dir="models"):
return 8192


def get_prompt(model_name=DEFAULT_MODEL, models_dir="models"):
def get_prompt(model_name="", models_dir="models"):
if model_name == "":
global DEFAULT_MODEL
model_name = DEFAULT_MODEL
model_name = model_name.lower()
if os.path.exists(f"{models_dir}/{model_name}/prompt.txt"):
with open(f"{models_dir}/{model_name}/prompt.txt", "r") as f:
@@ -129,10 +128,10 @@ def get_prompt(model_name=DEFAULT_MODEL, models_dir="models"):
return prompt_template


def get_model(model_name=DEFAULT_MODEL, models_dir="models"):
global RAM
global DOWNLOAD_MODELS
if RAM > 16:
def get_model(model_name="", models_dir="models"):
if model_name == "":
global DEFAULT_MODEL
model_name = DEFAULT_MODEL
DOWNLOAD_MODELS = (
True if os.environ.get("DOWNLOAD_MODELS", "true").lower() == "true" else False
)
ram = round(psutil.virtual_memory().total / 1024**3)
if ram > 16:
default_quantization_type = "Q5_K_M"
else:
default_quantization_type = "Q4_K_M"
@@ -166,13 +170,13 @@ def get_model(model_name=DEFAULT_MODEL, models_dir="models"):
if model_name != "mistrallite-7b"
else f"https://huggingface.co/TheBloke/MistralLite-7B-GGUF/resolve/main/mistrallite.{quantization_type}.gguf"
)
print(f"[LLM] Downloading {model_name}...")
logging.info(f"[LLM] Downloading {model_name}...")
with requests.get(url, stream=True, allow_redirects=True) as r:
with open(file_path, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
if clip_url != "":
print(f"[LLM] Downloading {model_name} CLIP...")
logging.info(f"[LLM] Downloading {model_name} CLIP...")
with requests.get(clip_url, stream=True, allow_redirects=True) as r:
with open(
f"{models_dir}/{model_name}/mmproj-model-f16.gguf", "wb"
@@ -253,9 +257,20 @@ def __init__(
system_message: str = "",
**kwargs,
):
global THREADS
global GPU_LAYERS
global MAIN_GPU
global DEFAULT_MODEL
THREADS = os.environ.get("THREADS", psutil.cpu_count() - 2)
MAIN_GPU = os.environ.get("MAIN_GPU", 0)
GPU_LAYERS = os.environ.get("GPU_LAYERS", 0)
if torch.cuda.is_available() and int(GPU_LAYERS) == 0:
vram = round(torch.cuda.get_device_properties(0).total_memory / 1024**3)
logging.info(f"[LLM] {vram}GB of VRAM detected.")
if vram >= 48 or vram <= 2:
GPU_LAYERS = vram
else:
GPU_LAYERS = vram * 2
logging.info(
f"[LLM] Loading {DEFAULT_MODEL} with {GPU_LAYERS} GPU layers and {THREADS} CPU threads available for offloading. Please wait..."
)
self.params = {}
self.model_name = model
if model != "":
@@ -394,5 +409,5 @@ def models(self):


if __name__ == "__main__":
print(f"[LLM] Downloading {DEFAULT_MODEL} model...")
logging.info(f"[LLM] Downloading {DEFAULT_MODEL} model...")
get_model(model_name=DEFAULT_MODEL, models_dir="models")
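
To make the new GPU-layer heuristic in `LLM.__init__` concrete, here is a small standalone restatement with a few worked values. The helper function name is illustrative; the actual logic runs inline in the constructor shown above whenever `GPU_LAYERS` is 0 and CUDA is available.

```python
# Illustrative restatement of the GPU_LAYERS heuristic added in this commit:
# very small (<= 2 GB) and very large (>= 48 GB) cards get one layer per GB
# of detected VRAM, everything in between gets two layers per GB.
def auto_gpu_layers(vram_gb: int) -> int:
    if vram_gb >= 48 or vram_gb <= 2:
        return vram_gb
    return vram_gb * 2


for vram in (2, 8, 24, 48):
    print(f"{vram} GB VRAM -> {auto_gpu_layers(vram)} GPU layers")
# 2 GB VRAM -> 2 GPU layers
# 8 GB VRAM -> 16 GPU layers
# 24 GB VRAM -> 48 GPU layers
# 48 GB VRAM -> 48 GPU layers
```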
3 changes: 2 additions & 1 deletion local_llm/STT.py
@@ -3,6 +3,7 @@
import io
import requests
import uuid
import logging
from whisper_cpp import Whisper
from pydub import AudioSegment

@@ -52,7 +53,7 @@ async def transcribe_audio(self, base64_audio, audio_format="m4a"):
raise RuntimeError(f"Failed to load audio.")
self.w.transcribe(file_path)
user_input = self.w.output(output_txt=False)
print(f"[STT] Transcribed User Input: {user_input}")
logging.info(f"[STT] Transcribed User Input: {user_input}")
user_input = user_input.replace("[BLANK_AUDIO]", "")
os.remove(file_path)
return user_input
2 changes: 1 addition & 1 deletion setup.py
@@ -10,7 +10,7 @@

setup(
name="local-llm",
version="0.1.0",
version="0.1.1",
description="Local-LLM is a llama.cpp server in Docker with OpenAI Style Endpoints.",
long_description=long_description,
long_description_content_type="text/markdown",
