Commit 89b7a8c: Merge branch 'main' into ea/qwen25vl

Authored by eaidova on Feb 17, 2025
2 parents: 107d7ef + 8c94f53

Showing 11 changed files with 709 additions and 546 deletions.
926 changes: 420 additions & 506 deletions notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb

Large diffs are not rendered by default.

9 changes: 7 additions & 2 deletions optimum/exporters/openvino/__main__.py
@@ -22,7 +22,7 @@

 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from requests.exceptions import ConnectionError as RequestsConnectionError
-from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
+from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase, ProcessorMixin
 from transformers.utils import is_torch_available

 from openvino.runtime import Core, Type, save_model
@@ -531,10 +531,15 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro

     if is_openvino_tokenizers_available():
         if library_name != "diffusers" and preprocessors:
+            processor_chat_template = None
             tokenizer = next(filter(lambda it: isinstance(it, PreTrainedTokenizerBase), preprocessors), None)
+            if len(preprocessors) > 1:
+                for processor in preprocessors:
+                    if isinstance(processor, ProcessorMixin) and hasattr(processor, "chat_template"):
+                        processor_chat_template = processor.chat_template
             if tokenizer:
                 try:
-                    export_tokenizer(tokenizer, output, task=task)
+                    export_tokenizer(tokenizer, output, task=task, processor_chat_template=processor_chat_template)
                 except Exception as exception:
                     logger.warning(
                         "Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer "
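The change above matters for multimodal exports: checkpoints like the ones this branch targets often carry their chat template on the processor rather than on the tokenizer, and the new plumbing forwards it to the exported tokenizer. A minimal sketch of the same extraction outside the exporter (the model ID is illustrative only, not taken from this diff):

from transformers import AutoProcessor, AutoTokenizer

model_id = "Qwen/Qwen2.5-VL-3B-Instruct"  # illustrative multimodal checkpoint
processor = AutoProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Mirror of the loop added above: fall back to the processor's template
# when one is present, since the tokenizer alone may not carry it.
processor_chat_template = getattr(processor, "chat_template", None)
print(processor_chat_template is not None)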
6 changes: 5 additions & 1 deletion optimum/exporters/openvino/convert.py
@@ -71,6 +71,7 @@
     remove_none_from_dummy_inputs,
     save_config,
     save_preprocessors,
+    set_simplified_chat_template,
 )


@@ -825,6 +826,7 @@ def export_tokenizer(
     output: Union[str, Path],
     suffix: Optional[str] = "",
     task: Optional[str] = None,
+    processor_chat_template: Optional[str] = None,
 ):
     # avoid circular imports
     from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME
@@ -849,7 +851,7 @@

     if (
         task is not None
-        and task.startswith("text-generation")
+        and (task.startswith("text-generation") or task == "image-text-to-text")
         and compare_versions("openvino-tokenizers", ">=", "2024.3.0.0")
     ):
         logger.info(f"Set tokenizer padding side to left for `{task}` task.")
@@ -858,6 +860,8 @@

     try:
         converted = convert_tokenizer(tokenizer, with_detokenizer=True)
+        set_simplified_chat_template(converted[0], processor_chat_template)
+
     except NotImplementedError:
         logger.info("Detokenizer is not supported, convert tokenizer only.")
         converted = convert_tokenizer(tokenizer, with_detokenizer=False)
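set_simplified_chat_template itself lives in optimum/exporters/openvino/utils.py, whose diff is not rendered below, so the following is only a hedged sketch of the idea: persist a processor-supplied template on the converted tokenizer model, for which OpenVINO's rt_info metadata is a natural slot. The helper name and body here are assumptions, not the actual implementation:

import openvino as ov

def attach_chat_template_sketch(ov_tokenizer: ov.Model, chat_template) -> None:
    # Hypothetical stand-in for set_simplified_chat_template: store the
    # template in the tokenizer IR's rt_info so it survives export.
    if chat_template is not None:
        ov_tokenizer.set_rt_info(chat_template, "chat_template")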
33 changes: 33 additions & 0 deletions optimum/exporters/openvino/utils.py

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions optimum/intel/openvino/configuration.py
@@ -210,6 +210,14 @@ class OVQuantizationMethod(str, Enum):
         "quant_method": OVQuantizationMethod.AWQ,
         "scale_estimation": True,
     },
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": {
+        "bits": 4,
+        "sym": False,
+        "group_size": 64,
+        "ratio": 0.8,
+        "dataset": "wikitext2",
+        "quant_method": OVQuantizationMethod.AWQ,
+    },
 }

 _DEFAULT_4BIT_CONFIG = {
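The new entry registers per-model int4 defaults for DeepSeek-R1-Distill-Llama-8B. A quick way to inspect what a checkpoint gets, assuming the module-level table this entry extends is named _DEFAULT_4BIT_CONFIGS (the plural name and import path are internal details inferred from the repository, and may move):

from optimum.intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS

cfg = _DEFAULT_4BIT_CONFIGS.get("deepseek-ai/DeepSeek-R1-Distill-Llama-8B", {})
# Expect asymmetric 4-bit AWQ, group_size=64, ratio=0.8, calibrated on wikitext2.
print(cfg)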
50 changes: 25 additions & 25 deletions optimum/intel/openvino/modeling.py
@@ -174,9 +174,9 @@ def forward(

         np_inputs = isinstance(input_ids, np.ndarray)
         if not np_inputs:
-            input_ids = np.array(input_ids)
-            attention_mask = np.array(attention_mask)
-            token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids
+            input_ids = input_ids.cpu().numpy()
+            attention_mask = attention_mask.cpu().numpy()
+            token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids

         inputs = {
             "input_ids": input_ids,
@@ -239,9 +239,9 @@ def forward(

         np_inputs = isinstance(input_ids, np.ndarray)
         if not np_inputs:
-            input_ids = np.array(input_ids)
-            attention_mask = np.array(attention_mask)
-            token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids
+            input_ids = input_ids.cpu().numpy()
+            attention_mask = attention_mask.cpu().numpy()
+            token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids

         inputs = {
             "input_ids": input_ids,
@@ -308,9 +308,9 @@ def forward(

         np_inputs = isinstance(input_ids, np.ndarray)
         if not np_inputs:
-            input_ids = np.array(input_ids)
-            attention_mask = np.array(attention_mask)
-            token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids
+            input_ids = input_ids.cpu().numpy()
+            attention_mask = attention_mask.cpu().numpy()
+            token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids

         inputs = {
             "input_ids": input_ids,
@@ -379,9 +379,9 @@ def forward(

         np_inputs = isinstance(input_ids, np.ndarray)
         if not np_inputs:
-            input_ids = np.array(input_ids)
-            attention_mask = np.array(attention_mask)
-            token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids
+            input_ids = input_ids.cpu().numpy()
+            attention_mask = attention_mask.cpu().numpy()
+            token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids

         inputs = {
             "input_ids": input_ids,
@@ -448,9 +448,9 @@ def forward(

         np_inputs = isinstance(input_ids, np.ndarray)
         if not np_inputs:
-            input_ids = np.array(input_ids)
-            attention_mask = np.array(attention_mask)
-            token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids
+            input_ids = input_ids.cpu().numpy()
+            attention_mask = attention_mask.cpu().numpy()
+            token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids

         inputs = {
             "input_ids": input_ids,
@@ -581,7 +581,7 @@ def forward(

         np_inputs = isinstance(pixel_values, np.ndarray)
         if not np_inputs:
-            pixel_values = np.array(pixel_values)
+            pixel_values = pixel_values.cpu().numpy()

         inputs = {
             "pixel_values": pixel_values,
@@ -640,8 +640,8 @@ def forward(

         np_inputs = isinstance(input_values, np.ndarray)
         if not np_inputs:
-            input_values = np.array(input_values)
-            attention_mask = np.array(attention_mask) if attention_mask is not None else attention_mask
+            input_values = input_values.cpu().numpy()
+            attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask

         inputs = {
             "input_values": input_values,
@@ -711,8 +711,8 @@ def forward(
     ):
         np_inputs = isinstance(input_values, np.ndarray)
         if not np_inputs:
-            input_values = np.array(input_values)
-            attention_mask = np.array(attention_mask) if attention_mask is not None else attention_mask
+            input_values = input_values.cpu().numpy()
+            attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask

         inputs = {
             "input_values": input_values,
@@ -791,8 +791,8 @@ def forward(
     ):
         np_inputs = isinstance(input_values, np.ndarray)
         if not np_inputs:
-            input_values = np.array(input_values)
-            attention_mask = np.array(attention_mask) if attention_mask is not None else attention_mask
+            input_values = input_values.cpu().numpy()
+            attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask

         inputs = {
             "input_values": input_values,
@@ -867,8 +867,8 @@ def forward(
     ):
         np_inputs = isinstance(input_values, np.ndarray)
        if not np_inputs:
-            input_values = np.array(input_values)
-            attention_mask = np.array(attention_mask) if attention_mask is not None else attention_mask
+            input_values = input_values.cpu().numpy()
+            attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask

         inputs = {
             "input_values": input_values,
@@ -929,7 +929,7 @@ def forward(self, **kwargs):
         np_inputs = isinstance(next(iter(kwargs.values())), np.ndarray)
         inputs = {}
         for input_name in self.input_names:
-            inputs[input_name] = np.array(kwargs.pop(input_name)) if not np_inputs else kwargs.pop(input_name)
+            inputs[input_name] = kwargs.pop(input_name).cpu().numpy() if not np_inputs else kwargs.pop(input_name)

         outputs = self._inference(inputs)
         model_outputs = {}
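Every hunk in this file swaps np.array(tensor) for tensor.cpu().numpy(). The motivation, as a small illustration: np.array copies a CPU torch tensor and fails outright on one living on an accelerator, while the explicit .cpu() hop moves it to host memory first and .numpy() then shares that memory instead of copying:

import torch

t = torch.zeros(2, 3)        # CPU tensor; np.array(t) works here too, but always copies
a = t.cpu().numpy()          # .cpu() is a no-op on host tensors; .numpy() shares memory
t[0, 0] = 1.0
print(a[0, 0])               # 1.0: the array views the tensor's storage

# For a CUDA tensor, np.array(t) raises TypeError ("can't convert cuda:0 device type
# tensor to numpy"), which is what the swapped calls above avoid.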
8 changes: 4 additions & 4 deletions optimum/intel/openvino/modeling_decoder.py
@@ -21,8 +21,8 @@
 import openvino
 import torch
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+from openvino import Core, Tensor, Type
 from openvino.preprocess import PrePostProcessor
-from openvino.runtime import Core, Tensor, Type
 from transformers import AutoModelForCausalLM, PretrainedConfig
 from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.generation import GenerationMixin
@@ -492,11 +492,11 @@ def prepare_inputs(
             self.next_beam_idx = np.arange(batch_size, dtype=int)
             self._past_length = 0
         past_len = self._get_past_length(past_key_values)
-        inputs["input_ids"] = np.array(input_ids)
+        inputs["input_ids"] = input_ids.cpu().numpy()
         # Add the attention_mask inputs when needed
         if "attention_mask" in self.input_names or "position_ids" in self.input_names:
             if attention_mask is not None:
-                attention_mask = np.array(attention_mask)
+                attention_mask = attention_mask.cpu().numpy()
             else:
                 attention_mask = np.ones(
                     (input_ids.shape[0], input_ids.shape[1] + past_len), dtype=inputs["input_ids"].dtype
@@ -507,7 +507,7 @@

         if "position_ids" in self.input_names:
             if position_ids is not None:
-                position_ids = np.array(position_ids)
+                position_ids = position_ids.cpu().numpy()
             else:
                 position_ids = np.cumsum(attention_mask, axis=1) - 1
                 position_ids[attention_mask == 0] = 1
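The else-branch above rebuilds position_ids from the attention mask when the caller does not pass them. A small worked example of that arithmetic (values chosen for illustration):

import numpy as np

attention_mask = np.array([[0, 0, 1, 1, 1]])          # one left-padded sequence
position_ids = np.cumsum(attention_mask, axis=1) - 1  # [[-1, -1, 0, 1, 2]]
position_ids[attention_mask == 0] = 1                 # [[ 1,  1, 0, 1, 2]]
print(position_ids)  # real tokens count 0..2; padded slots hold a harmless dummy value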
6 changes: 3 additions & 3 deletions optimum/intel/openvino/modeling_sentence_transformers.py
@@ -43,9 +43,9 @@ def forward(self, inputs: Dict[str, torch.Tensor]):

         np_inputs = isinstance(input_ids, np.ndarray)
         if not np_inputs:
-            input_ids = np.array(input_ids)
-            attention_mask = np.array(attention_mask)
-            token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids
+            input_ids = input_ids.cpu().numpy()
+            attention_mask = attention_mask.cpu().numpy()
+            token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids

         inputs = {
             "input_ids": input_ids,
4 changes: 2 additions & 2 deletions optimum/intel/openvino/modeling_visual_language.py
@@ -150,7 +150,7 @@ def prepare_inputs(
         # Add the attention_mask inputs when needed
         if "attention_mask" in self.input_names or "position_ids" in self.input_names:
             if attention_mask is not None:
-                attention_mask = np.array(attention_mask)
+                attention_mask = attention_mask.cpu().numpy()
             else:
                 attention_mask = np.ones((inputs_embeds.shape[0], inputs_embeds.shape[1] + past_len), dtype=int)

@@ -159,7 +159,7 @@

         if "position_ids" in self.input_names:
             if position_ids is not None:
-                position_ids = np.array(position_ids)
+                position_ids = position_ids.cpu().numpy()
             else:
                 position_ids = np.cumsum(attention_mask, axis=1) - 1
                 position_ids[attention_mask == 0] = 1
(Diffs for the remaining 2 of 11 changed files were not loaded.)
