Fix workflow and messages

Josh-XT committed Jan 27, 2024
1 parent 8181461 commit 77a1f8b

Showing 3 changed files with 17 additions and 5 deletions.

.github/workflows/publish-docker-dev.yml (1 addition, 1 deletion)

@@ -58,7 +58,7 @@ jobs:
       - name: Get full image path
         id: get_image_path
         run: |
-          echo "IMAGE_PATH=$(echo ghcr.io/${{ env.GITHUB_USER }}/${{ env.REPO_NAME }}:${{ matrix.tag_name }}-${{ env.BRANCH_NAME }}-${{ github.sha }})" >> $GITHUB_ENV
+          echo "IMAGE_PATH=$(echo ghcr.io/${{ env.GITHUB_USER }}/${{ env.REPO_NAME }}:cpu-dev-${{ env.BRANCH_NAME }}-${{ github.sha }})" >> $GITHUB_ENV

   test-local-llm:
     uses: josh-xt/AGiXT/.github/workflows/operation-test-with-jupyter.yml@main
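
For illustration, here is a minimal Python sketch (not from the repository) of the string this step now writes to $GITHUB_ENV; the function name and the example argument values are assumptions, not taken from a real workflow run:

```python
# Sketch of the image-path assembly after this commit: the tag prefix is
# hard-coded to "cpu-dev" instead of being taken from the matrix's tag_name.
def image_path(github_user: str, repo_name: str, branch_name: str, sha: str) -> str:
    return f"ghcr.io/{github_user}/{repo_name}:cpu-dev-{branch_name}-{sha}"

# Hypothetical values for illustration only:
print(image_path("josh-xt", "local-llm", "dev", "77a1f8b"))
# ghcr.io/josh-xt/local-llm:cpu-dev-dev-77a1f8b
```

Hard-coding the prefix appears to keep the dev image tag stable regardless of which matrix entry runs this step.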

docker-compose-dev.yml (14 additions, 2 deletions)

@@ -2,15 +2,27 @@ version: '3.8'
 
 services:
   local-llm:
-    image: ghcr.io/josh-xt/local-llm:cpu-dev-dev
+    image: ghcr.io/josh-xt/local-llm:cpu-dev
     environment:
       - LOCAL_LLM_API_KEY=${LOCAL_LLM_API_KEY-}
-      - GPU_LAYERS=0
+      - GPU_LAYERS=${GPU_LAYERS-0}
+      - MAIN_GPU=${MAIN_GPU-0}
       - DEFAULT_MODEL=${DEFAULT_MODEL-phi-2-dpo}
       - WHISPER_MODEL=${WHISPER_MODEL-base.en}
+      - CMAKE_ARGS="-DLLAMA_CUBLAS=on"
+      - LLAMA_CUBLAS=1
+      - CUDA_DOCKER_ARCH=all
     restart: unless-stopped
     ports:
       - "8091:8091"
     volumes:
       - ./models:/app/models
       - ./outputs:/app/outputs
+      - ./voices:/app/voices
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [ gpu ]
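
Two details above are worth noting. The ${VAR-default} references use Compose's parameter interpolation, where the default applies only when the variable is unset in the host environment, and the new deploy.resources.reservations block asks the container runtime to expose all NVIDIA GPUs to the service. A minimal Python sketch of the equivalent environment lookup, using the variable names and defaults from the diff:

```python
import os

# Compose's ${VAR-default} falls back to the default only when VAR is unset;
# os.environ.get behaves the same way (an empty string still passes through).
def resolved_env() -> dict:
    return {
        "GPU_LAYERS": os.environ.get("GPU_LAYERS", "0"),
        "MAIN_GPU": os.environ.get("MAIN_GPU", "0"),
        "DEFAULT_MODEL": os.environ.get("DEFAULT_MODEL", "phi-2-dpo"),
        "WHISPER_MODEL": os.environ.get("WHISPER_MODEL", "base.en"),
    }

# Usage sketch: overriding one variable at launch, e.g.
#   GPU_LAYERS=20 docker compose -f docker-compose-dev.yml up
print(resolved_env())
```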

local_llm/LLM.py (2 additions, 2 deletions)

@@ -263,13 +263,13 @@ def __init__(
         GPU_LAYERS = os.environ.get("GPU_LAYERS", 0)
         if torch.cuda.is_available() and int(GPU_LAYERS) == 0:
             vram = round(torch.cuda.get_device_properties(0).total_memory / 1024**3)
-            logging.info(f"[LLM] {vram} GB of VRAM detected.")
+            logging.info(f"[LLM] {vram}GB of VRAM detected.")
             if vram >= 48 or vram <= 2:
                 GPU_LAYERS = vram
             else:
                 GPU_LAYERS = vram * 2
         logging.info(
-            f"[LLM] Running {DEFAULT_MODEL} with {GPU_LAYERS} GPU layers and {THREADS} CPU threads available for offloading."
+            f"[LLM] Loading {DEFAULT_MODEL} with {GPU_LAYERS} GPU layers and {THREADS} CPU threads available for offloading. Please wait..."
         )
         self.params = {}
         self.model_name = model
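
The GPU-layer heuristic in the hunk above can be sanity-checked in isolation; this sketch only mirrors the arithmetic from the changed function and is not part of the repository:

```python
# Mirrors the GPU_LAYERS heuristic from the diff: cards at the extremes
# (<= 2 GB or >= 48 GB of VRAM) get one layer per GB; everything else gets two.
def gpu_layers_for(vram_gb: int) -> int:
    if vram_gb >= 48 or vram_gb <= 2:
        return vram_gb
    return vram_gb * 2

assert gpu_layers_for(2) == 2    # small card: conservative offload
assert gpu_layers_for(8) == 16   # mid-range card: two layers per GB
assert gpu_layers_for(48) == 48  # large card: back to one layer per GB
```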
