[Core]: Support destroying all KV cache during runtime #10810

Status: Closed · wants to merge 1 commit
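This PR threads a new `destroy_cache` hook through the engine, executor, and worker layers so that all GPU/CPU KV cache blocks can be freed at runtime without shutting the engine down. A minimal sketch of the intended call pattern, assuming the private `LLMEngine` methods shown in the diffs below (the model name is illustrative only):

```python
from vllm import LLM

llm = LLM(model="facebook/opt-125m")
engine = llm.llm_engine

# Added in this PR: stop any remote worker execution loops, then free
# every GPU/CPU KV cache block so the memory is available to others.
engine._destroy_kv_caches()

# Existing method: re-profile free memory and re-allocate the KV cache
# before serving requests again.
engine._initialize_kv_caches()
```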
6 changes: 6 additions & 0 deletions vllm/engine/llm_engine.py
@@ -488,6 +488,12 @@ def _initialize_kv_caches(self) -> None:

        self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    def _destroy_kv_caches(self) -> None:
        """Destroy the KV cache in the worker(s) without shutting down."""
        self.model_executor.stop_remote_worker_execution_loop()
        self.model_executor.destroy_cache()

    @classmethod
    def _get_executor_cls(cls,
                          engine_config: VllmConfig) -> Type[ExecutorBase]:
5 changes: 5 additions & 0 deletions vllm/executor/distributed_gpu_executor.py
@@ -68,6 +68,11 @@ def initialize_cache(self, num_gpu_blocks: int,
                          num_gpu_blocks=num_gpu_blocks,
                          num_cpu_blocks=num_cpu_blocks)

    def destroy_cache(self) -> None:
        """Destroy the KV cache in all workers."""
        self._run_workers("destroy_cache")

    def execute_model(
            self,
            execute_model_req: ExecuteModelRequest,
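`destroy_cache` reaches every worker through the executor's generic fan-out helper. A minimal sketch of that pattern (not vLLM's actual `_run_workers`, which additionally handles Ray remote calls and driver/worker coordination):

```python
from typing import Any, List

def run_workers(workers: List[Any], method: str, *args, **kwargs) -> List[Any]:
    # Resolve the method by name on each worker; a new operation such as
    # "destroy_cache" then needs no extra executor-side plumbing.
    return [getattr(worker, method)(*args, **kwargs) for worker in workers]
```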
7 changes: 7 additions & 0 deletions vllm/executor/executor_base.py
@@ -62,6 +62,13 @@ def initialize_cache(self, num_gpu_blocks: int,
        """
        raise NotImplementedError

    # TODO: Make this an abstract method that all executors must implement.
    # @abstractmethod
    def destroy_cache(self) -> None:
        """Destroy the KV cache."""
        raise NotImplementedError

    @abstractmethod
    def execute_model(
            self, execute_model_req: ExecuteModelRequest
5 changes: 5 additions & 0 deletions vllm/executor/gpu_executor.py
@@ -82,6 +82,11 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:

        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    def destroy_cache(self) -> None:
        """Destroy the KV cache by invoking the underlying worker."""
        self.driver_worker.destroy_cache()

    def execute_model(
        self, execute_model_req: ExecuteModelRequest
    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
13 changes: 13 additions & 0 deletions vllm/worker/cache_engine.py
@@ -84,6 +84,19 @@ def _allocate_kv_cache(
                            device=device))
        return kv_cache

    def destroy(self) -> None:
        # Drop every reference to the allocated KV cache tensors so the
        # allocator can reclaim them.
        while self.gpu_cache:
            tensor = self.gpu_cache.pop()
            del tensor

        while self.cpu_cache:
            tensor = self.cpu_cache.pop()
            del tensor

        # Collect immediately so the freed tensors do not linger until
        # the next automatic GC cycle.
        import gc
        gc.collect()

    def swap_in(self, src_to_dst: torch.Tensor) -> None:
        for i in range(self.num_attention_layers):
            self.attn_backend.swap_blocks(self.cpu_cache[i], self.gpu_cache[i],
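Dropping the last reference is what actually frees the GPU memory here: once a CUDA tensor becomes unreachable, PyTorch's caching allocator reclaims its blocks, and the `torch.cuda.empty_cache()` call in `Worker.destroy_cache` below hands them back to the driver. A self-contained sketch, with an illustrative dummy cache shape:

```python
import gc
import torch

# Stand-in for CacheEngine.gpu_cache: one tensor per attention layer.
cache = [torch.empty(2, 1024, 16, 128, device="cuda") for _ in range(4)]
print(torch.cuda.memory_allocated())  # large: the cache tensors are live

while cache:
    tensor = cache.pop()
    del tensor  # drop the last reference to this layer's cache
gc.collect()

print(torch.cuda.memory_allocated())  # near zero: blocks reclaimed
torch.cuda.empty_cache()              # hand cached blocks back to the driver
print(torch.cuda.memory_reserved())   # reduced after empty_cache()
```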
9 changes: 9 additions & 0 deletions vllm/worker/worker.py
@@ -285,6 +285,15 @@ def initialize_cache(self, num_gpu_blocks: int,
        self._init_cache_engine()
        self._warm_up_model()

    def destroy_cache(self) -> None:
        # Reset the block counts so the config reflects that no KV cache
        # is allocated anymore.
        self.cache_config.num_gpu_blocks = 0
        self.cache_config.num_cpu_blocks = 0
        # Destroy one CacheEngine per pipeline-parallel virtual engine.
        while self.cache_engine:
            cache_engine = self.cache_engine.pop()
            cache_engine.destroy()
        self.gpu_cache = None
        # Return the freed blocks from PyTorch's caching allocator to
        # the CUDA driver so other processes can use the memory.
        torch.cuda.empty_cache()

    def _init_cache_engine(self):
        assert self.cache_config.num_gpu_blocks is not None
        self.cache_engine = [
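One way to exercise the whole stack would be a smoke test asserting that destruction really lowers allocated GPU memory. This is a hypothetical test, not part of the PR; the model name, `enforce_eager`, and the use of the private engine method are illustrative assumptions:

```python
import torch
from vllm import LLM

def test_destroy_kv_caches_frees_memory():
    llm = LLM(model="facebook/opt-125m", enforce_eager=True)
    engine = llm.llm_engine

    before = torch.cuda.memory_allocated()
    engine._destroy_kv_caches()
    after = torch.cuda.memory_allocated()

    # The KV cache dominates allocated memory, so expect a large drop.
    assert after < before
```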
7 changes: 7 additions & 0 deletions vllm/worker/worker_base.py
@@ -73,6 +73,13 @@ def initialize_cache(self, num_gpu_blocks: int,
        """
        raise NotImplementedError

    # TODO: Make this an abstract method that all workers must implement.
    # @abstractmethod
    def destroy_cache(self) -> None:
        """Clear out all KV cache blocks in the current worker."""
        raise NotImplementedError

    @current_platform.inference_mode()
    def start_worker_execution_loop(self) -> None:
        """Execute model loop in parallel worker.