Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enhancement: use local files #343

Merged
merged 7 commits into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
## 0.7.31-dev0
## 0.7.31-dev1

* refactor: remove all `cid` related code that was originally added to filter out invalid `pdfminer` text
* enhancement: Wrapped `hf_hub_download` with a function that checks for a local file before checking the Hugging Face Hub

## 0.7.30

Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.31-dev0" # pragma: no cover
__version__ = "0.7.31-dev1" # pragma: no cover
7 changes: 3 additions & 4 deletions unstructured_inference/models/chipper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import torch
import transformers
from cv2.typing import MatLike
from huggingface_hub import hf_hub_download
from PIL.Image import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel
from transformers.generation.logits_process import LogitsProcessor
Expand All @@ -22,7 +21,7 @@
from unstructured_inference.models.unstructuredmodel import (
UnstructuredElementExtractionModel,
)
from unstructured_inference.utils import LazyDict, strip_tags
from unstructured_inference.utils import LazyDict, download_if_needed_and_get_local_path, strip_tags

MODEL_TYPES: Dict[str, Union[LazyDict, dict]] = {
"chipperv1": {
Expand Down Expand Up @@ -115,8 +114,8 @@ def initialize(
token=auth_token,
)
if swap_head:
lm_head_file = hf_hub_download(
repo_id=pre_trained_model_repo,
lm_head_file = download_if_needed_and_get_local_path(
path_or_repo=pre_trained_model_repo,
filename="lm_head.pth",
token=auth_token,
)
Expand Down
15 changes: 9 additions & 6 deletions unstructured_inference/models/detectron2.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from pathlib import Path
from typing import Any, Dict, Final, List, Optional, Union

from huggingface_hub import hf_hub_download
from layoutparser.models.detectron2.layoutmodel import (
Detectron2LayoutModel,
is_detectron2_available,
Expand All @@ -17,7 +16,11 @@
from unstructured_inference.models.unstructuredmodel import (
UnstructuredObjectDetectionModel,
)
from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
from unstructured_inference.utils import (
LazyDict,
LazyEvaluateInfo,
download_if_needed_and_get_local_path,
)

DETECTRON_CONFIG: Final = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"
DEFAULT_LABEL_MAP: Final[Dict[int, str]] = {
Expand All @@ -35,12 +38,12 @@
MODEL_TYPES = {
"detectron2_lp": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"layoutparser/detectron2",
"PubLayNet/faster_rcnn_R_50_FPN_3x/model_final.pth",
),
config_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"layoutparser/detectron2",
"PubLayNet/faster_rcnn_R_50_FPN_3x/config.yml",
),
Expand All @@ -49,12 +52,12 @@
),
"checkbox": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/oer-checkbox",
"detectron2_finetuned_oer_checkbox.pth",
),
config_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/oer-checkbox",
"detectron2_oer_checkbox.json",
),
Expand Down
11 changes: 7 additions & 4 deletions unstructured_inference/models/detectron2onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import cv2
import numpy as np
import onnxruntime
from huggingface_hub import hf_hub_download
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from onnxruntime.capi import _pybind_state as C
from onnxruntime.quantization import QuantType, quantize_dynamic
Expand All @@ -16,7 +15,11 @@
from unstructured_inference.models.unstructuredmodel import (
UnstructuredObjectDetectionModel,
)
from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
from unstructured_inference.utils import (
LazyDict,
LazyEvaluateInfo,
download_if_needed_and_get_local_path,
)

onnxruntime.set_default_logger_severity(logger_onnx.getEffectiveLevel())

Expand All @@ -34,7 +37,7 @@
MODEL_TYPES: Dict[str, Union[LazyDict, dict]] = {
"detectron2_onnx": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/detectron2_faster_rcnn_R_50_FPN_3x",
"model.onnx",
),
Expand All @@ -52,7 +55,7 @@
},
"detectron2_mask_rcnn": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/detectron2_mask_rcnn_X_101_32x8d_FPN_3x",
"model.onnx",
),
Expand Down
13 changes: 8 additions & 5 deletions unstructured_inference/models/yolox.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,17 @@
import cv2
import numpy as np
import onnxruntime
from huggingface_hub import hf_hub_download
from onnxruntime.capi import _pybind_state as C
from PIL import Image as PILImage

from unstructured_inference.constants import ElementType, Source
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
from unstructured_inference.utils import (
LazyDict,
LazyEvaluateInfo,
download_if_needed_and_get_local_path,
)

YOLOX_LABEL_MAP = {
0: ElementType.CAPTION,
Expand All @@ -34,23 +37,23 @@
MODEL_TYPES = {
"yolox": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/yolo_x_layout",
"yolox_l0.05.onnx",
),
label_map=YOLOX_LABEL_MAP,
),
"yolox_tiny": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/yolo_x_layout",
"yolox_tiny.onnx",
),
label_map=YOLOX_LABEL_MAP,
),
"yolox_quantized": LazyDict(
model_path=LazyEvaluateInfo(
hf_hub_download,
download_if_needed_and_get_local_path,
"unstructuredio/yolo_x_layout",
"yolox_l0.05_quantized.onnx",
),
Expand Down
11 changes: 11 additions & 0 deletions unstructured_inference/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import cv2
import numpy as np
from huggingface_hub import hf_hub_download
from PIL import Image

from unstructured_inference.constants import AnnotationResult
Expand Down Expand Up @@ -182,3 +183,13 @@ def strip_tags(html: str) -> str:
s = MLStripper()
s.feed(html)
return s.get_data()


def download_if_needed_and_get_local_path(path_or_repo: str, filename: str, **kwargs) -> str:
    """Resolve a model file to a path on disk, preferring an existing local copy.

    ``path_or_repo`` is first interpreted as a directory: if joining it with
    ``filename`` yields a path that exists, that joined path is returned as-is
    and nothing is downloaded. Otherwise ``path_or_repo`` is handed to
    ``hf_hub_download`` as its first positional argument together with
    ``filename``, and whatever that call returns is returned.

    Args:
        path_or_repo: Either a local directory or the value to pass to
            ``hf_hub_download`` when no local file is found.
        filename: Name of the file, relative to ``path_or_repo``.
        **kwargs: Forwarded unchanged to ``hf_hub_download``; ignored when the
            local file is found.

    Returns:
        The path of the local file, or the result of ``hf_hub_download``.
    """
    local_candidate = os.path.join(path_or_repo, filename)
    # Guard clause: only fall back to the hub when nothing exists at the joined path.
    if not os.path.exists(local_candidate):
        return hf_hub_download(path_or_repo, filename, **kwargs)
    return local_candidate
Loading