Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[tokenizer] Refactor djl_convert python code #3179

Merged
merged 1 commit into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions extensions/tokenizers/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
Cargo.lock
/tokenizers
/jnilib
model/
tmp/
models.json
4 changes: 1 addition & 3 deletions extensions/tokenizers/src/main/python/.gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
__pycache__
model/
tmp/
models.json
*.egg-info/
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

import torch

from huggingface_converter import HuggingfaceConverter
from djl_converter.huggingface_converter import HuggingfaceConverter


class FillMaskConverter(HuggingfaceConverter):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@
from argparse import Namespace

import onnx
import safetensors_convert
from djl_converter.safetensors_convert import convert_file
import torch
from huggingface_hub import hf_hub_download, HfApi
from transformers import pipeline, AutoTokenizer, AutoConfig

from metadata import HuggingfaceMetadata
from shasum import sha1_sum
from zip_utils import zip_dir
from djl_converter.metadata import HuggingfaceMetadata
from djl_converter.shasum import sha1_sum
from djl_converter.zip_utils import zip_dir


class PipelineHolder(object):
Expand Down Expand Up @@ -139,7 +139,7 @@ def save_rust_model(self, model_info, args: Namespace, temp_dir: str):
elif has_pt_file:
file = hf_hub_download(repo_id=model_id,
filename="pytorch_model.bin")
safetensors_convert.convert_file(file, target)
convert_file(file, target)
else:
return False, f"No model file found for: {model_id}", -1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@
from huggingface_hub import HfApi
from huggingface_hub import hf_hub_download
from huggingface_hub.hf_api import ModelInfo
from djl_converter.fill_mask_converter import FillMaskConverter
from djl_converter.metadata import get_lang_tags
from djl_converter.question_answering_converter import QuestionAnsweringConverter
from djl_converter.sentence_similarity_converter import SentenceSimilarityConverter
from djl_converter.text_classification_converter import TextClassificationConverter
from djl_converter.token_classification_converter import TokenClassificationConverter

ARCHITECTURES_2_TASK = {
"ForQuestionAnswering": "question-answering",
Expand All @@ -27,19 +33,13 @@
"ForMultipleChoice": "text-classification",
"ForMaskedLM": "fill-mask",
}
LANGUAGES = HfApi().get_model_tags()["language"]


def get_lang_tags(model_info):
    """Build the language-tag mapping for a model.

    Every entry of ``model_info.tags`` that is also found in the
    module-level ``LANGUAGES`` collection becomes a key mapped to the
    string ``"true"``.

    :param model_info: object exposing an iterable ``tags`` attribute
    :return: dict of matching tags to ``"true"``; ``{"en": "true"}`` when
             no tag matches
    """
    matched = {name: "true" for name in model_info.tags if name in LANGUAGES}
    # Fall back to English when the model declares no known language tag.
    return matched or {"en": "true"}
SUPPORTED_TASKS = {
"fill-mask": FillMaskConverter(),
"question-answering": QuestionAnsweringConverter(),
"sentence-similarity": SentenceSimilarityConverter(),
"text-classification": TextClassificationConverter(),
"token-classification": TokenClassificationConverter(),
}


class HuggingfaceModels:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,22 @@
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
# the specific language governing permissions and limitations under the License.
import json
from huggingface_models import get_lang_tags

from huggingface_hub import HfApi

LANGUAGES = HfApi().get_model_tags()["language"]


def get_lang_tags(model_info):
    """Build the language-tag mapping for a model.

    Every entry of ``model_info.tags`` that is also found in the
    module-level ``LANGUAGES`` collection becomes a key mapped to the
    string ``"true"``.

    :param model_info: object exposing an iterable ``tags`` attribute
    :return: dict of matching tags to ``"true"``; ``{"en": "true"}`` when
             no tag matches
    """
    matched = {name: "true" for name in model_info.tags if name in LANGUAGES}
    # Fall back to English when the model declares no known language tag.
    return matched or {"en": "true"}


class HuggingfaceMetadata:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,13 @@
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
# the specific language governing permissions and limitations under the License.
import logging
import os.path
import os
import shutil
import sys

from arg_parser import converter_args
from fill_mask_converter import FillMaskConverter
from huggingface_models import HuggingfaceModels
from question_answering_converter import QuestionAnsweringConverter
from sentence_similarity_converter import SentenceSimilarityConverter
from text_classification_converter import TextClassificationConverter
from token_classification_converter import TokenClassificationConverter
sys.path.append(os.path.dirname(os.path.realpath(__file__)))

SUPPORTED_TASK = {
"fill-mask": FillMaskConverter(),
"question-answering": QuestionAnsweringConverter(),
"sentence-similarity": SentenceSimilarityConverter(),
"text-classification": TextClassificationConverter(),
"token-classification": TokenClassificationConverter(),
}
from djl_converter.arg_parser import converter_args


def main():
Expand All @@ -38,6 +26,8 @@ def main():
level=logging.INFO)
args = converter_args()

from djl_converter.huggingface_models import HuggingfaceModels, SUPPORTED_TASKS

huggingface_models = HuggingfaceModels(args.output_dir)
temp_dir = f"{args.output_dir}/tmp"

Expand All @@ -48,7 +38,7 @@ def main():
for model in models:
task = model["task"]
model_info = model["model_info"]
converter = SUPPORTED_TASK[task]
converter = SUPPORTED_TASKS[task]

try:
result, reason, size = converter.save_model(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

import torch

from huggingface_converter import HuggingfaceConverter
from djl_converter.huggingface_converter import HuggingfaceConverter


class QuestionAnsweringConverter(HuggingfaceConverter):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig

from huggingface_converter import HuggingfaceConverter, PipelineHolder
from djl_converter.huggingface_converter import HuggingfaceConverter, PipelineHolder
from huggingface_hub import hf_hub_download


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

import torch

from huggingface_converter import HuggingfaceConverter
from djl_converter.huggingface_converter import HuggingfaceConverter


class TextClassificationConverter(HuggingfaceConverter):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

import torch

from huggingface_converter import HuggingfaceConverter
from djl_converter.huggingface_converter import HuggingfaceConverter


class TokenClassificationConverter(HuggingfaceConverter):
Expand Down
Loading