PXT-543: update for modelcache; fix CI #25

Draft · wants to merge 13 commits into base `main`
73 changes: 0 additions & 73 deletions .github/workflows/CI-test_handler.yml

This file was deleted.

27 changes: 14 additions & 13 deletions Dockerfile
@@ -1,22 +1,23 @@
-ARG WORKER_CUDA_VERSION=12.1.0
+# ATTENTION: is this still the right CUDA version?
+ARG WORKER_CUDA_VERSION=12.1.0
 FROM runpod/base:0.6.2-cuda${WORKER_CUDA_VERSION}
 
 RUN apt-get update && apt-get dist-upgrade -y
 # Reinitialize, as it's lost after the FROM command
+# &efron: this doesn't quite follow to me.
 ARG WORKER_CUDA_VERSION=12.1.0
 
-# Python dependencies
-COPY builder/requirements.txt /requirements.txt
-RUN python3.11 -m pip install --upgrade pip && \
-    python3.11 -m pip install -r /requirements.txt --no-cache-dir && \
-    rm /requirements.txt
+# Python dependencies.
 
-RUN pip uninstall torch -y && \
-    CUDA_VERSION_SHORT=$(echo ${WORKER_CUDA_VERSION} | cut -d. -f1,2 | tr -d .) && \
-    pip install --pre torch==2.4.0.dev20240518+cu${CUDA_VERSION_SHORT} --index-url https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION_SHORT} --no-cache-dir
+RUN --mount=type=cache,target=/root/.cache/pip python3.11 -m pip install --upgrade pip
 
-ENV HF_HOME=/runpod-volume
+# We always install torch. Important to do this FIRST - it can take >2m and we want to cache the result as early as possible.
+# TODO: pin this to a specific version
 
-# Add src files (Worker Template)
-ADD src .
+RUN --mount=type=cache,target=/root/.cache/pip python3.11 -m pip install torch torchvision torchaudio
 
-CMD python3.11 -u /handler.py
+# Our other requirements may change; updating the version of infinity embedding, for instance.
+COPY builder/requirements.txt /requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip python3.11 -m pip install -r /requirements.txt
+COPY ./src /src
+CMD python3.11 -u /src/handler.py
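The new `RUN --mount=type=cache` lines only take effect under BuildKit. As a minimal local-build sketch (the image tag is illustrative, not from this PR):

```sh
# BuildKit is required for --mount=type=cache; it is the default in recent Docker.
DOCKER_BUILDKIT=1 docker build \
  --build-arg WORKER_CUDA_VERSION=12.1.0 \
  -t worker-infinity-text-embedding:dev .
# The pip cache at /root/.cache/pip survives between builds, so even when a layer
# is invalidated (e.g. requirements.txt changes), wheels are not re-downloaded.
```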
26 changes: 24 additions & 2 deletions README.md
@@ -32,13 +32,35 @@ You can directly use the following docker images and configure them via Environment Variables

 **[NOTE]** Latest image version (pre) `runpod/worker-infinity-text-embedding:0.0.1-cuda12.1.0`
 ### 2. Select your models and configure your deployment with Environment Variables
-* `MODEL_NAMES`
+
+* `MODEL_NAMES` ⚠️ **DEPRECATED**
+
+  > [!WARNING]
+  > This environment variable is deprecated. Please use `RUNPOD_HUGGINGFACE_MODEL` instead.
 
   HuggingFace repo of a single model or multiple models separated by semicolon.
 
   - Examples:
     - **Single** Model: `BAAI/bge-small-en-v1.5`
     - **Multiple** Models: `BAAI/bge-small-en-v1.5;intfloat/e5-large-v2;`
+* `RUNPOD_HUGGINGFACE_MODEL`
+
+  HuggingFace model repository path in the format `user/model`, or `user/model:revision` to pin a particular model version. Models are downloaded and cached automatically before container startup.
+
+  - Examples:
+    - **Single** Model: `BAAI/bge-small-en-v1.5`
+    - **Multiple** Models: `BAAI/bge-small-en-v1.5,intfloat/e5-large-v2`
+* `RUNPOD_HUGGINGFACE_TOKEN`
+
+  HuggingFace token for accessing private models.
+
+  > [!WARNING]
+  > Both `RUNPOD_HUGGINGFACE_TOKEN` and `RUNPOD_HUGGINGFACE_USER` must be provided together to access private models. Providing only one will not work.
+* `RUNPOD_HUGGINGFACE_USER`
+
+  HuggingFace username for accessing private models.
+
+  > [!WARNING]
+  > Both `RUNPOD_HUGGINGFACE_TOKEN` and `RUNPOD_HUGGINGFACE_USER` must be provided together to access private models. Providing only one will not work.
 * `BATCH_SIZES`
 
   Batch Size for each model separated by semicolon.
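The README says models are "downloaded and cached automatically before container startup". That mechanism is outside this diff; as a hedged sketch of what a pre-download step for the `user/model[:revision]` format could look like with the standard `huggingface_hub` API (the `prefetch` helper is hypothetical):

```python
import os

from huggingface_hub import snapshot_download


def prefetch(spec: str) -> str:
    """Download one `user/model[:revision]` spec; returns the local snapshot path."""
    repo_id, _, revision = spec.partition(":")
    return snapshot_download(
        repo_id=repo_id,
        revision=revision or "main",  # default to the main branch
        token=os.environ.get("RUNPOD_HUGGINGFACE_TOKEN"),  # only needed for private models
    )


for spec in os.environ.get("RUNPOD_HUGGINGFACE_MODEL", "").split(","):
    if spec.strip():
        prefetch(spec.strip())
```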
5 changes: 2 additions & 3 deletions builder/requirements.txt
@@ -1,4 +1,3 @@
-runpod~=1.7.0
-infinity-emb[all,onnxruntime-gpu]==0.0.53
+runpod>=1.7.7
+infinity-emb[all,onnxruntime-gpu]==0.0.75
 einops # deployment of custom code with nomic
-git+https://github.com/pytorch-labs/float8_experimental.git@f7a920d2c53db8912f2a0c1d9040dbe71a88906d
57 changes: 50 additions & 7 deletions src/config.py
@@ -1,6 +1,10 @@
 import os
 from dotenv import load_dotenv
 from functools import cached_property
+from runpod import RunPodLogger
+from urllib.parse import urljoin
+
+logger = RunPodLogger()
 
 DEFAULT_BATCH_SIZE = 32
 DEFAULT_BACKEND = "torch"
@@ -9,6 +13,33 @@
 # how many items can be in the queue
 os.environ["INFINITY_QUEUE_SIZE"] = "48000"
 
+MODEL_CACHE_PATH_TEMPLATE = "/runpod/cache/model/{path}"
+
+CONFIG_MESSAGE_TEMPLATE = "{message} [see https://github.com/runpod-workers/worker-infinity-embedding for more information]"
+
+
+def topath(raw: str) -> str:
+    raw = raw.strip()
+    if ":" in raw:
+        model, branch = raw.rsplit(":", maxsplit=1)
+    else:
+        model, branch = raw, "main"
+    if "/" not in model:
+        raise ValueError(
+            f"invalid model: expected one in the form user/model[:revision], but got {model}"
+        )
+    user, model = model.rsplit("/", maxsplit=1)
+    return MODEL_CACHE_PATH_TEMPLATE.format(
+        path="/".join(c.strip("/") for c in (user, model, branch))
+    )
+
+
+def modelpaths(path: str = "") -> list[str]:
+    raw = os.environ.get("RUNPOD_HUGGINGFACE_MODEL", path)
+    if not raw:
+        return []
+    # skip empty entries (e.g. a trailing separator)
+    return [topath(m) for m in raw.split(",") if m.strip()]
+
 
 class EmbeddingServiceConfig:
     def __init__(self):
@@ -29,12 +60,24 @@ def backend(self):

     @cached_property
     def model_names(self) -> list[str]:
-        model_names = os.environ.get("MODEL_NAMES")
-        if not model_names:
-            raise ValueError("MODEL_NAMES environment variable is required")
-        model_names = model_names.split(";")
-        model_names = [model_name for model_name in model_names if model_name]
-        return model_names
+        # check if the legacy env var is defined
+        deprecated_model_names = os.environ.get("MODEL_NAMES", "")
+        if deprecated_model_names:
+            logger.warn(
+                CONFIG_MESSAGE_TEMPLATE.format(
+                    message="MODEL_NAMES is deprecated, use RUNPOD_HUGGINGFACE_MODEL"
+                )
+            )
+        # legacy MODEL_NAMES is semicolon-separated; modelpaths() splits on commas
+        cache_paths: list[str] = modelpaths(deprecated_model_names.replace(";", ","))
+        if not cache_paths:
+            raise ValueError(
+                CONFIG_MESSAGE_TEMPLATE.format(
+                    message="RUNPOD_HUGGINGFACE_MODEL environment variable is required"
+                )
+            )
+        return sorted(cache_paths)

     @cached_property
     def batch_sizes(self) -> list[int]:
@@ -46,7 +89,7 @@ def batch_sizes(self):
     def dtypes(self) -> list[str]:
         dtypes = self._get_no_required_multi("DTYPES", "auto")
         return dtypes
 
     @cached_property
     def runpod_max_concurrency(self) -> int:
         return int(os.environ.get("RUNPOD_MAX_CONCURRENCY", 300))
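To make the new cache-path mapping concrete, a small usage sketch of `topath`/`modelpaths` (outputs follow directly from `MODEL_CACHE_PATH_TEMPLATE` above; the import assumes `src/` is on `sys.path`):

```python
import os

from config import modelpaths, topath

# The revision defaults to "main" when no ":revision" suffix is given.
assert topath("BAAI/bge-small-en-v1.5") == "/runpod/cache/model/BAAI/bge-small-en-v1.5/main"
assert topath("intfloat/e5-large-v2:v1") == "/runpod/cache/model/intfloat/e5-large-v2/v1"

# modelpaths() reads RUNPOD_HUGGINGFACE_MODEL (comma-separated) and maps every entry.
os.environ["RUNPOD_HUGGINGFACE_MODEL"] = "BAAI/bge-small-en-v1.5,intfloat/e5-large-v2:v1"
print(modelpaths())
# ['/runpod/cache/model/BAAI/bge-small-en-v1.5/main',
#  '/runpod/cache/model/intfloat/e5-large-v2/v1']
```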
3 changes: 2 additions & 1 deletion src/embedding_service.py
@@ -6,7 +6,7 @@
     list_embeddings_to_response,
     to_rerank_response,
 )
-
+import os
 import asyncio
 
 
@@ -20,6 +20,7 @@ def __init__(self):
         engine_args.append(
             EngineArgs(
                 model_name_or_path=model_name,
+                revision=os.path.basename(model_name),
                 batch_size=batch_size,
                 engine=self.config.backend,
                 dtype=dtype,
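The added `revision=os.path.basename(model_name)` works because `model_names` now yields cache paths whose final component is the revision (see `topath` in `src/config.py`); a quick check:

```python
import os

model_name = "/runpod/cache/model/BAAI/bge-small-en-v1.5/main"
assert os.path.basename(model_name) == "main"  # last path component is the revision
```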
Empty file added test.sh
Empty file.