From 327b498e68c8beb2b3274978ce00d9471adcb3e2 Mon Sep 17 00:00:00 2001
From: SunXiaoye <31361630+JingofXin@users.noreply.github.com>
Date: Thu, 19 Dec 2024 16:18:12 +0800
Subject: [PATCH] Unified Download Progress for Huggingface and Modelscope
 (#394)

---
 lazyllm/components/auto/autodeploy.py         |   2 +-
 lazyllm/components/auto/autofinetune.py       |   2 +-
 lazyllm/components/embedding/embed.py         |   4 +-
 .../components/speech_to_text/sense_voice.py  |   2 +-
 .../stable_diffusion/stable_diffusion3.py     |   2 +-
 lazyllm/components/text_to_speech/bark.py     |   2 +-
 lazyllm/components/text_to_speech/chattts.py  |   2 +-
 lazyllm/components/text_to_speech/musicgen.py |   2 +-
 .../utils/downloader/model_downloader.py      | 259 ++++++++++++++----
 lazyllm/module/module.py                      |   2 +-
 lazyllm/tools/train_service/serve.py          |   2 +-
 11 files changed, 224 insertions(+), 57 deletions(-)

diff --git a/lazyllm/components/auto/autodeploy.py b/lazyllm/components/auto/autodeploy.py
index bf3a557c..992b84c0 100644
--- a/lazyllm/components/auto/autodeploy.py
+++ b/lazyllm/components/auto/autodeploy.py
@@ -18,7 +18,7 @@ class AutoDeploy(LazyLLMDeployBase):
     def __new__(cls, base_model, source=lazyllm.config['model_source'], trust_remote_code=True, max_token_num=1024,
                 launcher=launchers.remote(ngpus=1), stream=False, type=None, **kw):
-        base_model = ModelManager(source).download(base_model)
+        base_model = ModelManager(source).download(base_model) or ''
         model_name = get_model_name(base_model)
         if not type:
             type = ModelManager.get_model_type(model_name)
diff --git a/lazyllm/components/auto/autofinetune.py b/lazyllm/components/auto/autofinetune.py
index a3ac8ef0..07e8842f 100644
--- a/lazyllm/components/auto/autofinetune.py
+++ b/lazyllm/components/auto/autofinetune.py
@@ -10,7 +10,7 @@ class AutoFinetune(LazyLLMFinetuneBase):
     def __new__(cls, base_model, target_path, source=lazyllm.config['model_source'], merge_path=None, ctx_len=1024,
                 batch_size=32, lora_r=8, launcher=launchers.remote(ngpus=1), **kw):
-        base_model = ModelManager(source).download(base_model)
+        base_model = ModelManager(source).download(base_model) or ''
         model_name = get_model_name(base_model)
         model_type = ModelManager.get_model_type(model_name)
         if model_type in ['embed', 'tts', 'vlm', 'stt', 'sd']:
diff --git a/lazyllm/components/embedding/embed.py b/lazyllm/components/embedding/embed.py
index bf7600a7..f6e7798e 100644
--- a/lazyllm/components/embedding/embed.py
+++ b/lazyllm/components/embedding/embed.py
@@ -9,7 +9,7 @@ class LazyHuggingFaceEmbedding(object):
     def __init__(self, base_embed, source=None, init=False):
         from ..utils.downloader import ModelManager
         source = lazyllm.config['model_source'] if not source else source
-        self.base_embed = ModelManager(source).download(base_embed)
+        self.base_embed = ModelManager(source).download(base_embed) or ''
         self.embed = None
         self.tokenizer = None
         self.device = "cpu"
@@ -48,7 +48,7 @@ class LazyHuggingFaceRerank(object):
     def __init__(self, base_rerank, source=None, init=False):
         from ..utils.downloader import ModelManager
         source = lazyllm.config['model_source'] if not source else source
-        self.base_rerank = ModelManager(source).download(base_rerank)
+        self.base_rerank = ModelManager(source).download(base_rerank) or ''
         self.reranker = None
         self.init_flag = lazyllm.once_flag()
         if init:
diff --git a/lazyllm/components/speech_to_text/sense_voice.py b/lazyllm/components/speech_to_text/sense_voice.py
index 47fbfef7..2c741f58 100644
--- a/lazyllm/components/speech_to_text/sense_voice.py
+++ b/lazyllm/components/speech_to_text/sense_voice.py
@@ -20,7 +20,7 @@ def is_valid_path(path):
 class SenseVoice(object):
     def __init__(self, base_path, source=None, init=False):
         source = lazyllm.config['model_source'] if not source else source
-        self.base_path = ModelManager(source).download(base_path)
+        self.base_path = ModelManager(source).download(base_path) or ''
         self.model = None
         self.init_flag = lazyllm.once_flag()
         if init:
diff --git a/lazyllm/components/stable_diffusion/stable_diffusion3.py b/lazyllm/components/stable_diffusion/stable_diffusion3.py
index a68fb95a..fb2575cf 100644
--- a/lazyllm/components/stable_diffusion/stable_diffusion3.py
+++ b/lazyllm/components/stable_diffusion/stable_diffusion3.py
@@ -15,7 +15,7 @@ class StableDiffusion3(object):
     def __init__(self, base_sd, source=None, embed_batch_size=30, trust_remote_code=True, save_path=None,
                 init=False):
         source = lazyllm.config['model_source'] if not source else source
-        self.base_sd = ModelManager(source).download(base_sd)
+        self.base_sd = ModelManager(source).download(base_sd) or ''
         self.embed_batch_size = embed_batch_size
         self.trust_remote_code = trust_remote_code
         self.sd = None
diff --git a/lazyllm/components/text_to_speech/bark.py b/lazyllm/components/text_to_speech/bark.py
index 54591421..34b8a428 100644
--- a/lazyllm/components/text_to_speech/bark.py
+++ b/lazyllm/components/text_to_speech/bark.py
@@ -13,7 +13,7 @@ class Bark(object):
     def __init__(self, base_path, source=None, trust_remote_code=True, save_path=None, init=False):
         source = lazyllm.config['model_source'] if not source else source
-        self.base_path = ModelManager(source).download(base_path)
+        self.base_path = ModelManager(source).download(base_path) or ''
         self.trust_remote_code = trust_remote_code
         self.processor, self.bark = None, None
         self.init_flag = lazyllm.once_flag()
diff --git a/lazyllm/components/text_to_speech/chattts.py b/lazyllm/components/text_to_speech/chattts.py
index 15bf56a3..cf92edb4 100644
--- a/lazyllm/components/text_to_speech/chattts.py
+++ b/lazyllm/components/text_to_speech/chattts.py
@@ -12,7 +12,7 @@ class ChatTTSModule(object):
     def __init__(self, base_path, source=None, save_path=None, init=False):
         source = lazyllm.config['model_source'] if not source else source
-        self.base_path = ModelManager(source).download(base_path)
+        self.base_path = ModelManager(source).download(base_path) or ''
         self.model, self.spk = None, None
         self.init_flag = lazyllm.once_flag()
         self.device = 'cpu'
diff --git a/lazyllm/components/text_to_speech/musicgen.py b/lazyllm/components/text_to_speech/musicgen.py
index 0e10bfd2..9a355117 100644
--- a/lazyllm/components/text_to_speech/musicgen.py
+++ b/lazyllm/components/text_to_speech/musicgen.py
@@ -11,7 +11,7 @@ class MusicGen(object):
     def __init__(self, base_path, source=None, save_path=None, init=False):
         source = lazyllm.config['model_source'] if not source else source
-        self.base_path = ModelManager(source).download(base_path)
+        self.base_path = ModelManager(source).download(base_path) or ''
         self.model = None
         self.init_flag = lazyllm.once_flag()
         self.save_path = save_path or os.path.join(lazyllm.config['temp_dir'], 'musicgen')
diff --git a/lazyllm/components/utils/downloader/model_downloader.py b/lazyllm/components/utils/downloader/model_downloader.py
index 509ba90f..80db4707 100644
--- a/lazyllm/components/utils/downloader/model_downloader.py
+++ b/lazyllm/components/utils/downloader/model_downloader.py
@@ -1,5 +1,10 @@
 import os
+import time
 import shutil
+import functools
+import threading
+from abc import ABC, abstractmethod
+
 import lazyllm
 from .model_mapping import model_name_mapping, model_provider, model_groups
 from lazyllm.common.common import EnvVarContextManager
@@ -18,9 +23,16 @@ def __init__(self, model_source=lazyllm.config['model_source'],
                  cache_dir=lazyllm.config['model_cache_dir'],
                  model_path=lazyllm.config['model_path']):
         self.model_source = model_source
-        self.token = token
+        self.token = token or None
         self.cache_dir = cache_dir
         self.model_paths = model_path.split(":") if len(model_path) > 0 else []
+        if self.model_source == 'huggingface':
+            self.hub_downloader = HuggingfaceDownloader(token=self.token)
+        else:
+            self.hub_downloader = ModelscopeDownloader(token=self.token)
+            if self.model_source != 'modelscope':
+                lazyllm.LOG.warning("Only support Huggingface and Modelscope currently. "
+                                    f"Unsupported model source: {self.model_source}. Forcing use of Modelscope.")

     @classmethod
     def get_model_type(cls, model) -> str:
@@ -70,7 +82,7 @@ def _try_add_mapping(self, model):
             "source": {k: v + '/' + model_base for k, v in model_provider[matched_model_prefix].items()}
         }

-    def download(self, model=''):
+    def download(self, model='', call_back=None):
         assert isinstance(model, str), "model name should be a string."
         self._try_add_mapping(model)
         # Dummy or local model.
@@ -88,7 +100,7 @@ def download(self, model=''):
             full_model_dir = os.path.join(self.cache_dir, model)

             mapped_model_name = model_name_mapping[model.lower()]['source'][self.model_source]
-            model_save_dir = self._do_download(mapped_model_name)
+            model_save_dir = self._do_download(mapped_model_name, call_back)
             if model_save_dir:
                 # The code safely creates a symbolic link by removing any existing target.
                 if os.path.exists(full_model_dir):
@@ -97,17 +109,24 @@ def download(self, model=''):
                     os.unlink(full_model_dir)
                 os.symlink(model_save_dir, full_model_dir, target_is_directory=True)
                 return full_model_dir
-            return model  # failed to download model, keep model as it is
+            return model_save_dir  # return False
         else:
             model_name_for_download = model

-            # Try to figure out a possible model provider
-            matched_model_prefix = next((key for key in model_provider if model.lower().startswith(key)), None)
-            if matched_model_prefix and self.model_source in model_provider[matched_model_prefix]:
-                model_name_for_download = model_provider[matched_model_prefix][self.model_source] + '/' + model
+            if '/' not in model_name_for_download:
+                # Try to figure out a possible model provider
+                matched_model_prefix = next((key for key in model_provider if model.lower().startswith(key)), None)
+                if matched_model_prefix and self.model_source in model_provider[matched_model_prefix]:
+                    model_name_for_download = model_provider[matched_model_prefix][self.model_source] + '/' + model
+
+            model_save_dir = self._do_download(model_name_for_download, call_back)
+            return model_save_dir

-            model_save_dir = self._do_download(model_name_for_download)
-            return model_save_dir if model_save_dir else model
+    def validate_token(self):
+        return self.hub_downloader.verify_hub_token()
+
+    def validate_model_id(self, model_id):
+        return self.hub_downloader.verify_model_id(model_id)

     def _model_exists_at_path(self, model_name):
         if len(self.model_paths) == 0:
@@ -137,18 +156,15 @@ def _is_model_valid(self, model_dir):
             return False
         return any((True for _ in os.scandir(model_dir)))

-    def _do_download(self, model=''):
+    def _do_download(self, model='', call_back=None):
         model_dir = model.replace('/', os.sep)
         full_model_dir = os.path.join(self.cache_dir, self.model_source, model_dir)

         try:
-            if self.model_source == 'huggingface':
-                return self._download_model_from_hf(model, full_model_dir)
-            elif self.model_source == 'modelscope':
-                return self._download_model_from_ms(model, full_model_dir)
+            return self.hub_downloader.download(model, full_model_dir, call_back)
         # Use `BaseException` to capture `KeyboardInterrupt` and normal `Exceptioin`.
         except BaseException as e:
-            lazyllm.LOG.warning(f"Huggingface: {e}")
+            lazyllm.LOG.warning(f"Download encountered an error: {e}")
             if not self.token:
                 lazyllm.LOG.warning('Token is empty, which may prevent private models from being downloaded, '
                                     'as indicated by "the model does not exist." Please set the token with the '
@@ -156,39 +172,190 @@ def _do_download(self, model=''):
             if os.path.isdir(full_model_dir):
                 shutil.rmtree(full_model_dir)
                 lazyllm.LOG.warning(f"{full_model_dir} removed due to exceptions.")
-            return model
+            return False
+
+class HubDownloader(ABC):
+
+    def __init__(self, token=None):
+        self._token = token if self._verify_hub_token(token) else None
+        self._api = self._build_hub_api(self._token)
+
+    @abstractmethod
+    def _verify_hub_token(self, token):
+        pass
+
+    @abstractmethod
+    def _build_hub_api(self, token):
+        pass
+
+    @abstractmethod
+    def verify_model_id(self, model_id):
+        pass
+
+    @abstractmethod
+    def _do_download(self, model_id, model_dir):
+        pass
+
+    @abstractmethod
+    def _get_repo_files(self, model_id):
+        pass
+
+    def _polling_progress(self, model_dir, total, polling_event, call_back):
+        while not polling_event.is_set():
+            n = self._get_current_files_size(model_dir)
+            n = min(n, total)
+            if callable(call_back):
+                try:
+                    call_back(n, total)
+                except Exception as e:
+                    print(f"Error in callback: {e}")
+            time.sleep(1)
+
+    def _get_current_files_size(self, model_dir):
+        total_size = 0
+        for dirpath, _, filenames in os.walk(model_dir):
+            for f in filenames:
+                fp = os.path.join(dirpath, f)
+                if os.path.isfile(fp):
+                    total_size += os.path.getsize(fp)
+        return total_size
+
+    def _get_files_total_size(self, hub_model_info):
+        size = 0
+        for item in hub_model_info:
+            size += item['Size']
+        return size
+
+    def download(self, model_id, model_dir, call_back=None):
+        total = self._get_files_total_size(self._get_repo_files(model_id))
+        if call_back:
+            polling_event = threading.Event()
+            polling_thread = threading.Thread(target=self._polling_progress,
+                                              args=(model_dir, total, polling_event, call_back))
+            polling_thread.daemon = True
+            polling_thread.start()
+        downloaded_path = self._do_download(model_id, model_dir)
+        if call_back and polling_thread:
+            polling_event.set()
+            polling_thread.join()
+        return downloaded_path
+
+    def verify_hub_token(self):
+        return True if self._token else False
+
+class HuggingfaceDownloader(HubDownloader):
+
+    def _envs_manager(func):
+
+        @functools.wraps(func)
+        def wrapper(self, *args, **kwargs):
+            env_vars = {'https_proxy': lazyllm.config['https_proxy'] or os.environ.get("https_proxy", None),
+                        'http_proxy': lazyllm.config['http_proxy'] or os.environ.get("http_proxy", None)}
+            with EnvVarContextManager(env_vars):
+                if not os.environ.get("https_proxy", None):
+                    lazyllm.LOG.warning('If there is no download response or if downloads repeatedly fail over an '
+                                        'extended period, please set the `LAZYLLM_HTTPS_PROXY` environment variable '
+                                        'to configure a proxy. Do not directly set the `https_proxy` and `http_proxy` '
+                                        'environment variables in your environment, as doing so may disrupt model '
+                                        'deployment and result in deployment failures.')
+                return func(self, *args, **kwargs)
+        return wrapper
+
+    def _build_hub_api(self, token):
+        from huggingface_hub import HfApi
+        return HfApi(token=token)

-    def _download_model_from_hf(self, model_name='', model_dir=''):
+    @_envs_manager
+    def _verify_hub_token(self, token):
+        from huggingface_hub import HfApi
+        api = HfApi()
+        try:
+            api.whoami(token)
+            return True
+        except Exception as e:
+            lazyllm.LOG.warning('Verify failed: ', e)
+            return False
+
+    @_envs_manager
+    def verify_model_id(self, model_id):
+        try:
+            self._api.model_info(model_id)
+            return True
+        except Exception as e:
+            lazyllm.LOG.warning('Verify failed: ', e)
+            return False
+
+    @_envs_manager
+    def _do_download(self, model_id, model_dir):
         from huggingface_hub import snapshot_download
         # refer to https://huggingface.co/docs/huggingface_hub/v0.23.1/en/package_reference/file_download
-        if self.token == '':
-            self.token = None
-        elif self.token.lower() == 'true':
-            self.token = True
-        # else token would be a string from the user.
-        env_vars = {'https_proxy': lazyllm.config['https_proxy'] or os.environ.get("https_proxy", None),
-                    'http_proxy': lazyllm.config['http_proxy'] or os.environ.get("http_proxy", None)}
-        with EnvVarContextManager(env_vars):
-            if not os.environ.get("https_proxy", None):
-                lazyllm.LOG.warning('If there is no download response or if downloads repeatedly fail over an '
-                                    'extended period, please set the `LAZYLLM_HTTPS_PROXY` environment variable '
-                                    'to configure a proxy. Do not directly set the `https_proxy` and `http_proxy` '
-                                    'environment variables in your environment, as doing so may disrupt model '
-                                    'deployment and result in deployment failures.')
-            model_dir_result = snapshot_download(repo_id=model_name, local_dir=model_dir, token=self.token)
-
-        lazyllm.LOG.info(f"model downloaded at {model_dir_result}")
-        return model_dir_result
-
-    def _download_model_from_ms(self, model_name='', model_dir=''):
+        if not self.verify_model_id(model_id):
+            lazyllm.LOG.warning(f"Invalid model id:{model_id}")
+            return False
+        downloaded_path = snapshot_download(repo_id=model_id, local_dir=model_dir, token=self._token)
+        lazyllm.LOG.info(f"model downloaded at {downloaded_path}")
+        return downloaded_path
+
+    @_envs_manager
+    def _get_repo_files(self, model_id):
+        assert self._api
+        orgin_info = self._api.list_repo_tree(model_id, expand=True, recursive=True)
+        hub_model_info = []
+        for item in list(orgin_info):
+            if hasattr(item, 'size'):
+                hub_model_info.append({
+                    'Path': item.path,
+                    'Size': item.size,
+                    'SHA': item.blob_id,
+                })
+        return hub_model_info
+
+class ModelscopeDownloader(HubDownloader):
+
+    def _build_hub_api(self, token):
+        from modelscope.hub.api import HubApi
+        api = HubApi()
+        if token:
+            api.login(token)
+        return api
+
+    def _verify_hub_token(self, token):
+        from modelscope.hub.api import HubApi
+        api = HubApi()
+        try:
+            api.login(token)
+            return True
+        except Exception as e:
+            lazyllm.LOG.warning('Verify failed: ', e)
+            return False
+
+    def verify_model_id(self, model_id):
+        try:
+            self._api.get_model(model_id)
+            return True
+        except Exception as e:
+            lazyllm.LOG.warning('Verify failed: ', e)
+            return False
+
+    def _do_download(self, model_id, model_dir):
         from modelscope.hub.snapshot_download import snapshot_download
         # refer to https://www.modelscope.cn/docs/models/download
+        if not self.verify_model_id(model_id):
+            lazyllm.LOG.warning(f"Invalid model id:{model_id}")
+            return False
+        downloaded_path = snapshot_download(model_id=model_id, local_dir=model_dir)
+        lazyllm.LOG.info(f"Model downloaded at {downloaded_path}")
+        return downloaded_path

-        if (len(self.token) > 0):
-            from modelscope.hub.api import HubApi
-            api = HubApi()
-            api.login(self.token)
-        model_dir_result = snapshot_download(model_id=model_name, local_dir=model_dir)
-
-        lazyllm.LOG.info(f"Model downloaded at {model_dir_result}")
-        return model_dir_result
+    def _get_repo_files(self, model_id):
+        assert self._api
+        orgin_info = self._api.get_model_files(model_id, recursive=True)
+        hub_model_info = []
+        for item in orgin_info:
+            if item['Type'] == 'blob':
+                hub_model_info.append({
+                    'Path': item['Path'],
+                    'Size': item['Size'],
+                    'SHA': item['Sha256']
+                })
+        return hub_model_info
diff --git a/lazyllm/module/module.py b/lazyllm/module/module.py
index f4c8d323..3f1a7a27 100644
--- a/lazyllm/module/module.py
+++ b/lazyllm/module/module.py
@@ -603,7 +603,7 @@ def __init__(self, base_model='', target_path='', stream=False, train=None, fine
         super().__init__()
         # TODO(wangzhihong): Update ModelDownloader to support async download, and move it to deploy.
         #                    Then support Option for base_model
-        self._base_model = ModelManager(lazyllm.config['model_source']).download(base_model)
+        self._base_model = ModelManager(lazyllm.config['model_source']).download(base_model) or ''
         self._target_path = os.path.join(lazyllm.config['train_target_root'], target_path)
         self._stream = stream
         self._father = []
diff --git a/lazyllm/tools/train_service/serve.py b/lazyllm/tools/train_service/serve.py
index c2620e6c..c955f8fe 100644
--- a/lazyllm/tools/train_service/serve.py
+++ b/lazyllm/tools/train_service/serve.py
@@ -343,7 +343,7 @@ async def list_jobs(self, token: str = Header(None)):
         self._update_user_job_training_info(token)
         save_root = os.path.join(lazyllm.config['train_target_root'], token)
         server_running_dict = self._read_user_job_training_info(token)
-        m = lazyllm.TrainableModule('dummpy', save_root)
+        m = lazyllm.TrainableModule('', save_root)
         valid_models, invalid_models = m.get_all_models()
         for model_id, model_path in valid_models:
             job_id = model_path[len(save_root):].lstrip(os.sep).split(os.sep)[0]
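
Note (editor's addition, not part of the patch): a minimal usage sketch of the progress hook this commit introduces. It assumes `ModelManager` is importable as `lazyllm.components.utils.downloader.ModelManager` (the in-tree modules above use the equivalent relative import) and uses a placeholder repo id. Per `HubDownloader._polling_progress`, the callback is invoked roughly once per second with the bytes written so far and the repository's total size, and `download()` returns a local path on success or `False` on failure.

# Usage sketch (illustrative only): drive the new `call_back` progress hook.
# The repo id below is a placeholder; the import path mirrors the relative
# imports used inside lazyllm/components.
from lazyllm.components.utils.downloader import ModelManager

def report_progress(downloaded_bytes, total_bytes):
    # Called periodically by the polling thread with the current on-disk size
    # of the target directory and the total size reported by the hub.
    percent = downloaded_bytes / total_bytes * 100 if total_bytes else 0.0
    print(f"downloading: {downloaded_bytes}/{total_bytes} bytes ({percent:.1f}%)")

manager = ModelManager(model_source='huggingface')
path = manager.download('organization/model-name', call_back=report_progress)
print("saved to:", path)  # local path on success, False if the download failed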