Skip to content

Commit

Permalink
[tokenizer] Handles import huggingface model zoo exception case (#2872)
Browse files Browse the repository at this point in the history
  • Loading branch information
frankfliu committed Apr 26, 2024
1 parent dcfeb01 commit 29c227a
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 11 deletions.
19 changes: 15 additions & 4 deletions extensions/tokenizers/src/main/python/huggingface_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,20 @@ def save_model(self, model_info, args: Namespace, temp_dir: str):
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)

hf_pipeline = self.load_model(model_id)
# Save tokenizer.json to temp dir
self.save_tokenizer(hf_pipeline, temp_dir)
try:
hf_pipeline = self.load_model(model_id)
except Exception as e:
logging.warning(f"Failed to load model: {model_id}.")
logging.warning(e, exc_info=True)
return False, "Failed to load model", -1

try:
# Save tokenizer.json to temp dir
self.save_tokenizer(hf_pipeline, temp_dir)
except Exception as e:
logging.warning(f"Failed to save tokenizer: {model_id}.")
logging.warning(e, exc_info=True)
return False, "Failed to save tokenizer", -1

# Save config.json just for reference
config = hf_hub_download(repo_id=model_id, filename="config.json")
Expand Down Expand Up @@ -112,7 +123,7 @@ def jit_trace_model(self, hf_pipeline, model_id: str, temp_dir: str,
logging.info(f"Saving torchscript model: {model_name}.pt ...")
model_file = os.path.join(temp_dir, f"{model_name}.pt")
script_module.save(model_file)
except (RuntimeError, ValueError) as e:
except Exception as e:
logging.warning(f"Failed to trace model: {model_id}.")
logging.warning(e, exc_info=True)
return None
Expand Down
20 changes: 16 additions & 4 deletions extensions/tokenizers/src/main/python/huggingface_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,23 +56,29 @@ def __init__(self, output_dir: str):
self.temp_dir = f"{self.output_dir}/tmp"

def list_models(self, args: Namespace) -> List[dict]:
import_all = os.environ.get("HF_IMPORT_ALL")

api = HfApi()
if args.model_name:
models = api.list_models(filter="pytorch",
search=args.model_name,
sort="downloads",
direction=-1,
limit=args.limit)
if not models:
logging.warning(f"no model found: {args.model_name}.")
import_all = True
else:
models = api.list_models(filter=f"{args.category},pytorch",
sort="downloads",
direction=-1,
limit=args.limit)
if not models:
if not models:
if args.model_name:
logging.warning(f"no model found: {args.model_name}.")
else:
logging.warning(f"no model matches category: {args.category}.")

return []

ret = []
for model_info in models:
model_id = model_info.modelId
Expand All @@ -83,7 +89,7 @@ def list_models(self, args: Namespace) -> List[dict]:
continue

languages = get_lang_tags(model_info)
if "en" not in languages and not os.environ["HF_IMPORT_ALL"]:
if "en" not in languages and not import_all:
logging.warning(f"Skip non-English model: {model_id}.")
continue

Expand All @@ -94,6 +100,12 @@ def list_models(self, args: Namespace) -> List[dict]:
logging.info(f"Skip converted model: {model_id}.")
continue

if model_info.downloads < 50 and not import_all:
logging.info(
f"Skip model {model_info.modelId}, downloads {model_info.downloads} < 50"
)
continue

try:
config = hf_hub_download(repo_id=model_id,
filename="config.json")
Expand Down
14 changes: 11 additions & 3 deletions extensions/tokenizers/src/main/python/model_zoo_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,17 @@ def main():
model_info = model["model_info"]
converter = SUPPORTED_TASK[task]

result, reason, size = converter.save_model(model_info, args, temp_dir)
if not result:
logging.error(f"{model_info.modelId}: {reason}")
try:
result, reason, size = converter.save_model(
model_info, args, temp_dir)
if not result:
logging.error(f"{model_info.modelId}: {reason}")
except Exception as e:
logging.warning(f"Failed to convert model: {model_info.modelId}.")
logging.warning(e, exc_info=True)
result = False
reason = "Failed to convert model"
size = -1

huggingface_models.update_progress(model_info, converter.application,
result, reason, size, args.cpu_only)
Expand Down

0 comments on commit 29c227a

Please sign in to comment.