From ae8ec91786e32fdcf5f27a44da63daf776df0c14 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Mon, 6 May 2024 21:02:27 -0500 Subject: [PATCH 1/7] Add wrapped for hf_hub_download --- unstructured_inference/utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/unstructured_inference/utils.py b/unstructured_inference/utils.py index ae4e1495..4ed9d7e6 100644 --- a/unstructured_inference/utils.py +++ b/unstructured_inference/utils.py @@ -7,6 +7,7 @@ import cv2 import numpy as np from PIL import Image +from huggingface_hub import hf_hub_download from unstructured_inference.constants import AnnotationResult from unstructured_inference.inference.layoutelement import LayoutElement @@ -182,3 +183,12 @@ def strip_tags(html: str) -> str: s = MLStripper() s.feed(html) return s.get_data() + + +def download_if_needed_and_get_local_path(path_or_repo: str, filename: str, **kwargs) -> str: + """Returns path to local file if it exists, otherwise treats it as a huggingface repo and + attempts to download.""" + if os.path.exists(path_or_repo): + return path_or_repo + else: + return hf_hub_download(path_or_repo, filename, **kwargs) From 7bbbcd1d1f99083a32fc9c55ec1c953a29872e2a Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Mon, 6 May 2024 21:02:45 -0500 Subject: [PATCH 2/7] Replace hf_hub_download with wrapper --- unstructured_inference/models/chipper.py | 5 ++--- unstructured_inference/models/detectron2.py | 15 +++++++++------ unstructured_inference/models/detectron2onnx.py | 11 +++++++---- unstructured_inference/models/yolox.py | 13 ++++++++----- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/unstructured_inference/models/chipper.py b/unstructured_inference/models/chipper.py index 9128897c..06aa64e7 100644 --- a/unstructured_inference/models/chipper.py +++ b/unstructured_inference/models/chipper.py @@ -9,7 +9,6 @@ import torch import transformers from cv2.typing import MatLike -from huggingface_hub import hf_hub_download from PIL.Image import Image from transformers import DonutProcessor, VisionEncoderDecoderModel from transformers.generation.logits_process import LogitsProcessor @@ -22,7 +21,7 @@ from unstructured_inference.models.unstructuredmodel import ( UnstructuredElementExtractionModel, ) -from unstructured_inference.utils import LazyDict, strip_tags +from unstructured_inference.utils import LazyDict, strip_tags, download_if_needed_and_get_local_path MODEL_TYPES: Dict[str, Union[LazyDict, dict]] = { "chipperv1": { @@ -115,7 +114,7 @@ def initialize( token=auth_token, ) if swap_head: - lm_head_file = hf_hub_download( + lm_head_file = download_if_needed_and_get_local_path( repo_id=pre_trained_model_repo, filename="lm_head.pth", token=auth_token, diff --git a/unstructured_inference/models/detectron2.py b/unstructured_inference/models/detectron2.py index c38f6848..520298ce 100644 --- a/unstructured_inference/models/detectron2.py +++ b/unstructured_inference/models/detectron2.py @@ -3,7 +3,6 @@ from pathlib import Path from typing import Any, Dict, Final, List, Optional, Union -from huggingface_hub import hf_hub_download from layoutparser.models.detectron2.layoutmodel import ( Detectron2LayoutModel, is_detectron2_available, @@ -17,7 +16,11 @@ from unstructured_inference.models.unstructuredmodel import ( UnstructuredObjectDetectionModel, ) -from unstructured_inference.utils import LazyDict, LazyEvaluateInfo +from unstructured_inference.utils import ( + LazyDict, + LazyEvaluateInfo, + download_if_needed_and_get_local_path, +) DETECTRON_CONFIG: Final = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config" DEFAULT_LABEL_MAP: Final[Dict[int, str]] = { @@ -35,12 +38,12 @@ MODEL_TYPES = { "detectron2_lp": LazyDict( model_path=LazyEvaluateInfo( - hf_hub_download, + download_if_needed_and_get_local_path, "layoutparser/detectron2", "PubLayNet/faster_rcnn_R_50_FPN_3x/model_final.pth", ), config_path=LazyEvaluateInfo( - hf_hub_download, + download_if_needed_and_get_local_path, "layoutparser/detectron2", "PubLayNet/faster_rcnn_R_50_FPN_3x/config.yml", ), @@ -49,12 +52,12 @@ ), "checkbox": LazyDict( model_path=LazyEvaluateInfo( - hf_hub_download, + download_if_needed_and_get_local_path, "unstructuredio/oer-checkbox", "detectron2_finetuned_oer_checkbox.pth", ), config_path=LazyEvaluateInfo( - hf_hub_download, + download_if_needed_and_get_local_path, "unstructuredio/oer-checkbox", "detectron2_oer_checkbox.json", ), diff --git a/unstructured_inference/models/detectron2onnx.py b/unstructured_inference/models/detectron2onnx.py index 3def8ced..79cd0a1a 100644 --- a/unstructured_inference/models/detectron2onnx.py +++ b/unstructured_inference/models/detectron2onnx.py @@ -4,7 +4,6 @@ import cv2 import numpy as np import onnxruntime -from huggingface_hub import hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from onnxruntime.capi import _pybind_state as C from onnxruntime.quantization import QuantType, quantize_dynamic @@ -16,7 +15,11 @@ from unstructured_inference.models.unstructuredmodel import ( UnstructuredObjectDetectionModel, ) -from unstructured_inference.utils import LazyDict, LazyEvaluateInfo +from unstructured_inference.utils import ( + LazyDict, + LazyEvaluateInfo, + download_if_needed_and_get_local_path, +) onnxruntime.set_default_logger_severity(logger_onnx.getEffectiveLevel()) @@ -34,7 +37,7 @@ MODEL_TYPES: Dict[str, Union[LazyDict, dict]] = { "detectron2_onnx": LazyDict( model_path=LazyEvaluateInfo( - hf_hub_download, + download_if_needed_and_get_local_path, "unstructuredio/detectron2_faster_rcnn_R_50_FPN_3x", "model.onnx", ), @@ -52,7 +55,7 @@ }, "detectron2_mask_rcnn": LazyDict( model_path=LazyEvaluateInfo( - hf_hub_download, + download_if_needed_and_get_local_path, "unstructuredio/detectron2_mask_rcnn_X_101_32x8d_FPN_3x", "model.onnx", ), diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py index 852e15b3..0acd93f3 100644 --- a/unstructured_inference/models/yolox.py +++ b/unstructured_inference/models/yolox.py @@ -8,14 +8,17 @@ import cv2 import numpy as np import onnxruntime -from huggingface_hub import hf_hub_download from onnxruntime.capi import _pybind_state as C from PIL import Image as PILImage from unstructured_inference.constants import ElementType, Source from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel -from unstructured_inference.utils import LazyDict, LazyEvaluateInfo +from unstructured_inference.utils import ( + LazyDict, + LazyEvaluateInfo, + download_if_needed_and_get_local_path, +) YOLOX_LABEL_MAP = { 0: ElementType.CAPTION, @@ -34,7 +37,7 @@ MODEL_TYPES = { "yolox": LazyDict( model_path=LazyEvaluateInfo( - hf_hub_download, + download_if_needed_and_get_local_path, "unstructuredio/yolo_x_layout", "yolox_l0.05.onnx", ), @@ -42,7 +45,7 @@ ), "yolox_tiny": LazyDict( model_path=LazyEvaluateInfo( - hf_hub_download, + download_if_needed_and_get_local_path, "unstructuredio/yolo_x_layout", "yolox_tiny.onnx", ), @@ -50,7 +53,7 @@ ), "yolox_quantized": LazyDict( model_path=LazyEvaluateInfo( - hf_hub_download, + download_if_needed_and_get_local_path, "unstructuredio/yolo_x_layout", "yolox_l0.05_quantized.onnx", ), From f1ce434404401c9b5eeb7bf1feb45fbb31fb2cc7 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Mon, 6 May 2024 21:09:56 -0500 Subject: [PATCH 3/7] join to get full path --- unstructured_inference/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unstructured_inference/utils.py b/unstructured_inference/utils.py index 4ed9d7e6..447ae047 100644 --- a/unstructured_inference/utils.py +++ b/unstructured_inference/utils.py @@ -188,7 +188,8 @@ def strip_tags(html: str) -> str: def download_if_needed_and_get_local_path(path_or_repo: str, filename: str, **kwargs) -> str: """Returns path to local file if it exists, otherwise treats it as a huggingface repo and attempts to download.""" - if os.path.exists(path_or_repo): - return path_or_repo + full_path = os.path.join(path_or_repo, filename) + if os.path.exists(full_path): + return full_path else: return hf_hub_download(path_or_repo, filename, **kwargs) From 82212987218f3b3b0abc9ac2c22ddbad1385b9c2 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Mon, 6 May 2024 21:14:07 -0500 Subject: [PATCH 4/7] Fix path_or_repo --- unstructured_inference/models/chipper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_inference/models/chipper.py b/unstructured_inference/models/chipper.py index 06aa64e7..bdb4ea85 100644 --- a/unstructured_inference/models/chipper.py +++ b/unstructured_inference/models/chipper.py @@ -115,7 +115,7 @@ def initialize( ) if swap_head: lm_head_file = download_if_needed_and_get_local_path( - repo_id=pre_trained_model_repo, + path_or_repo=pre_trained_model_repo, filename="lm_head.pth", token=auth_token, ) From 1c9ea9207b3ae823e4b68ab9bd98a51be709775f Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Mon, 6 May 2024 21:14:19 -0500 Subject: [PATCH 5/7] update changelog --- CHANGELOG.md | 3 ++- unstructured_inference/__version__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b4a9586e..36ef661d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ -## 0.7.31-dev0 +## 0.7.31-dev1 * refactor: remove all `cid` related code that was originally added to filter out invalid `pdfminer` text +* enhancement: Wrapped hf_hub_download with a function that checks for local file before checking HF ## 0.7.30 diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 839644a9..974660ab 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.31-dev0" # pragma: no cover +__version__ = "0.7.31-dev1" # pragma: no cover From 6fc5620324276725967ed03e2fe3396cfb1a7eaf Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Mon, 6 May 2024 21:21:43 -0500 Subject: [PATCH 6/7] linting --- unstructured_inference/models/chipper.py | 2 +- unstructured_inference/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unstructured_inference/models/chipper.py b/unstructured_inference/models/chipper.py index bdb4ea85..38c57170 100644 --- a/unstructured_inference/models/chipper.py +++ b/unstructured_inference/models/chipper.py @@ -21,7 +21,7 @@ from unstructured_inference.models.unstructuredmodel import ( UnstructuredElementExtractionModel, ) -from unstructured_inference.utils import LazyDict, strip_tags, download_if_needed_and_get_local_path +from unstructured_inference.utils import LazyDict, download_if_needed_and_get_local_path, strip_tags MODEL_TYPES: Dict[str, Union[LazyDict, dict]] = { "chipperv1": { diff --git a/unstructured_inference/utils.py b/unstructured_inference/utils.py index 447ae047..8371e423 100644 --- a/unstructured_inference/utils.py +++ b/unstructured_inference/utils.py @@ -5,9 +5,9 @@ from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterable, Iterator, Union import cv2 +from huggingface_hub import hf_hub_download import numpy as np from PIL import Image -from huggingface_hub import hf_hub_download from unstructured_inference.constants import AnnotationResult from unstructured_inference.inference.layoutelement import LayoutElement From 9b5ebd22e16482efedcd81568b9fb4d6cfae501f Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Mon, 6 May 2024 21:25:47 -0500 Subject: [PATCH 7/7] LINTING --- unstructured_inference/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_inference/utils.py b/unstructured_inference/utils.py index 8371e423..c8ea035e 100644 --- a/unstructured_inference/utils.py +++ b/unstructured_inference/utils.py @@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterable, Iterator, Union import cv2 -from huggingface_hub import hf_hub_download import numpy as np +from huggingface_hub import hf_hub_download from PIL import Image from unstructured_inference.constants import AnnotationResult