diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 2459b4254..d75aa03b8 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -499,7 +499,7 @@ def main_export( cache_dir: Optional[str] = None, disable_neuron_cache: Optional[bool] = False, compiler_workdir: Optional[Union[str, Path]] = None, - inline_weights_to_neff: bool = False, + inline_weights_to_neff: bool = True, optlevel: str = "2", trust_remote_code: bool = False, subfolder: str = "", diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index a8103875e..a6e3a3c8c 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -465,7 +465,7 @@ def export_neuronx( config: "NeuronDefaultConfig", output: Path, compiler_workdir: Optional[Path] = None, - inline_weights_to_neff: bool = False, + inline_weights_to_neff: bool = True, optlevel: str = "2", auto_cast: Optional[str] = None, auto_cast_type: str = "bf16", @@ -482,7 +482,7 @@ def export_neuronx( Directory to store the exported Neuron model. compiler_workdir (`Optional[Path]`, defaults to `None`): The directory used by neuronx-cc, where you can find intermediary outputs (neff, weight, hlo...). - inline_weights_to_neff (`bool`, defaults to `False`): + inline_weights_to_neff (`bool`, defaults to `True`): Whether to inline the weights to the neff graph. If set to False, weights will be seperated from the neff. optlevel (`str`, defaults to `"2"`): The level of optimization the compiler should perform. Can be `"1"`, `"2"` or `"3"`, defaults to "2". @@ -610,7 +610,7 @@ def export_neuron( config: "NeuronDefaultConfig", output: Path, compiler_workdir: Optional[Path] = None, - inline_weights_to_neff: bool = False, + inline_weights_to_neff: bool = True, auto_cast: Optional[str] = None, auto_cast_type: str = "bf16", disable_fast_relayout: bool = False, @@ -628,7 +628,7 @@ def export_neuron( Directory to store the exported Neuron model. compiler_workdir (`Optional[Path]`, defaults to `None`): The directory used by neuron-cc, where you can find intermediary outputs (neff, weight, hlo...). - inline_weights_to_neff (`bool`, defaults to `False`): + inline_weights_to_neff (`bool`, defaults to `True`): Whether to inline the weights to the neff graph. If set to False, weights will be seperated from the neff. auto_cast (`Optional[str]`, defaults to `None`): Whether to cast operations from FP32 to lower precision to speed up the inference. Can be `None`, `"matmul"` or `"all"`, you should use `None` to disable any auto-casting, use `"matmul"` to cast FP32 matrix multiplication operations, and use `"all"` to cast all FP32 operations. diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py index acf380f70..6c52c6f5e 100644 --- a/optimum/neuron/modeling_base.py +++ b/optimum/neuron/modeling_base.py @@ -238,7 +238,7 @@ def _export( cache_dir: Optional[str] = None, compiler_workdir: Optional[Union[str, Path]] = None, disable_neuron_cache: bool = False, - inline_weights_to_neff: bool = False, + inline_weights_to_neff: bool = True, optlevel: str = "2", subfolder: str = "", local_files_only: bool = False, diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py index e86420c39..4257e25ba 100644 --- a/optimum/neuron/modeling_diffusion.py +++ b/optimum/neuron/modeling_diffusion.py @@ -577,7 +577,7 @@ def _export( cache_dir: Optional[str] = None, compiler_workdir: Optional[str] = None, disable_neuron_cache: bool = False, - inline_weights_to_neff: bool = False, + inline_weights_to_neff: bool = True, optlevel: str = "2", subfolder: str = "", local_files_only: bool = False, @@ -623,7 +623,7 @@ def _export( Path to a directory in which the neuron compiler will store all intermediary files during the compilation(neff, weight, hlo graph...). disable_neuron_cache (`bool`, defaults to `False`): Whether to disable automatic caching of compiled models. If set to True, will not load neuron cache nor cache the compiled artifacts. - inline_weights_to_neff (`bool`, defaults to `False`): + inline_weights_to_neff (`bool`, defaults to `True`): Whether to inline the weights to the neff graph. If set to False, weights will be seperated from the neff. optlevel (`str`, defaults to `"2"`): The level of optimization the compiler should perform. Can be `"1"`, `"2"` or `"3"`, defaults to "2". diff --git a/tests/cache/test_neuronx_cache.py b/tests/cache/test_neuronx_cache.py index 83a0bc5e1..9906397b8 100644 --- a/tests/cache/test_neuronx_cache.py +++ b/tests/cache/test_neuronx_cache.py @@ -92,6 +92,7 @@ def export_encoder_model(model_id): dynamic_batch_size=False, batch_size=batch_size, sequence_length=sequence_length, + inline_weights_to_neff=False, ) @@ -107,6 +108,7 @@ def export_stable_diffusion_model(model_id): height=height, width=width, num_images_per_prompt=num_images_per_prompt, + inline_weights_to_neff=False, ) @@ -122,6 +124,7 @@ def export_stable_diffusion_xl_model(model_id): height=height, width=width, num_images_per_prompt=num_images_per_prompt, + inline_weights_to_neff=False, )