Skip to content

Commit

Permalink
enhancement: use local files (#343)
Browse files Browse the repository at this point in the history
Add the ability to specify a local file for models.
  • Loading branch information
qued authored May 7, 2024
1 parent 55d35d4 commit d006776
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 21 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
## 0.7.31-dev0
## 0.7.31-dev1

* refactor: remove all `cid` related code that was originally added to filter out invalid `pdfminer` text
* enhancement: Wrapped `hf_hub_download` with a function that checks for a local file before falling back to the Hugging Face Hub

## 0.7.30

Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.31-dev0" # pragma: no cover
__version__ = "0.7.31-dev1" # pragma: no cover
7 changes: 3 additions & 4 deletions unstructured_inference/models/chipper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import torch
import transformers
from cv2.typing import MatLike
from huggingface_hub import hf_hub_download
from PIL.Image import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel
from transformers.generation.logits_process import LogitsProcessor
Expand All @@ -22,7 +21,7 @@
from unstructured_inference.models.unstructuredmodel import (
UnstructuredElementExtractionModel,
)
from unstructured_inference.utils import LazyDict, strip_tags
from unstructured_inference.utils import LazyDict, download_if_needed_and_get_local_path, strip_tags

MODEL_TYPES: Dict[str, Union[LazyDict, dict]] = {
"chipperv1": {
Expand Down Expand Up @@ -115,8 +114,8 @@ def initialize(
token=auth_token,
)
if swap_head:
lm_head_file = hf_hub_download(
repo_id=pre_trained_model_repo,
lm_head_file = download_if_needed_and_get_local_path(
path_or_repo=pre_trained_model_repo,
filename="lm_head.pth",
token=auth_token,
)
Expand Down
15 changes: 9 additions & 6 deletions unstructured_inference/models/detectron2.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from pathlib import Path
from typing import Any, Dict, Final, List, Optional, Union

from huggingface_hub import hf_hub_download
from layoutparser.models.detectron2.layoutmodel import (
Detectron2LayoutModel,
is_detectron2_available,
Expand All @@ -17,7 +16,11 @@
from unstructured_inference.models.unstructuredmodel import (
UnstructuredObjectDetectionModel,
)
from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
from unstructured_inference.utils import (
LazyDict,
LazyEvaluateInfo,
download_if_needed_and_get_local_path,
)

DETECTRON_CONFIG: Final = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"
DEFAULT_LABEL_MAP: Final[Dict[int, str]] = {
Expand All @@ -35,12 +38,12 @@
MODEL_TYPES = {
"detectron2_lp": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"layoutparser/detectron2",
"PubLayNet/faster_rcnn_R_50_FPN_3x/model_final.pth",
),
config_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"layoutparser/detectron2",
"PubLayNet/faster_rcnn_R_50_FPN_3x/config.yml",
),
Expand All @@ -49,12 +52,12 @@
),
"checkbox": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/oer-checkbox",
"detectron2_finetuned_oer_checkbox.pth",
),
config_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/oer-checkbox",
"detectron2_oer_checkbox.json",
),
Expand Down
11 changes: 7 additions & 4 deletions unstructured_inference/models/detectron2onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import cv2
import numpy as np
import onnxruntime
from huggingface_hub import hf_hub_download
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from onnxruntime.capi import _pybind_state as C
from onnxruntime.quantization import QuantType, quantize_dynamic
Expand All @@ -16,7 +15,11 @@
from unstructured_inference.models.unstructuredmodel import (
UnstructuredObjectDetectionModel,
)
from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
from unstructured_inference.utils import (
LazyDict,
LazyEvaluateInfo,
download_if_needed_and_get_local_path,
)

onnxruntime.set_default_logger_severity(logger_onnx.getEffectiveLevel())

Expand All @@ -34,7 +37,7 @@
MODEL_TYPES: Dict[str, Union[LazyDict, dict]] = {
"detectron2_onnx": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/detectron2_faster_rcnn_R_50_FPN_3x",
"model.onnx",
),
Expand All @@ -52,7 +55,7 @@
},
"detectron2_mask_rcnn": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/detectron2_mask_rcnn_X_101_32x8d_FPN_3x",
"model.onnx",
),
Expand Down
13 changes: 8 additions & 5 deletions unstructured_inference/models/yolox.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,17 @@
import cv2
import numpy as np
import onnxruntime
from huggingface_hub import hf_hub_download
from onnxruntime.capi import _pybind_state as C
from PIL import Image as PILImage

from unstructured_inference.constants import ElementType, Source
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
from unstructured_inference.utils import (
LazyDict,
LazyEvaluateInfo,
download_if_needed_and_get_local_path,
)

YOLOX_LABEL_MAP = {
0: ElementType.CAPTION,
Expand All @@ -34,23 +37,23 @@
MODEL_TYPES = {
"yolox": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/yolo_x_layout",
"yolox_l0.05.onnx",
),
label_map=YOLOX_LABEL_MAP,
),
"yolox_tiny": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/yolo_x_layout",
"yolox_tiny.onnx",
),
label_map=YOLOX_LABEL_MAP,
),
"yolox_quantized": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/yolo_x_layout",
"yolox_l0.05_quantized.onnx",
),
Expand Down
11 changes: 11 additions & 0 deletions unstructured_inference/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import cv2
import numpy as np
from huggingface_hub import hf_hub_download
from PIL import Image

from unstructured_inference.constants import AnnotationResult
Expand Down Expand Up @@ -182,3 +183,13 @@ def strip_tags(html: str) -> str:
s = MLStripper()
s.feed(html)
return s.get_data()


def download_if_needed_and_get_local_path(path_or_repo: str, filename: str, **kwargs) -> str:
    """Resolve a model file to a path on the local filesystem.

    ``filename`` is first looked up underneath ``path_or_repo`` on disk. When that
    joined path exists it is returned as-is. Otherwise ``path_or_repo`` is treated as
    a huggingface repo: it is passed to ``hf_hub_download`` together with ``filename``
    and any extra keyword arguments, and the result of that call is returned.
    """
    local_candidate = os.path.join(path_or_repo, filename)
    # A hit on disk wins, so the hub is never contacted for files that are already local.
    if not os.path.exists(local_candidate):
        return hf_hub_download(path_or_repo, filename, **kwargs)
    return local_candidate

0 comments on commit d006776

Please sign in to comment.