Skip to content

Commit

Permalink
feat: add file source resolution utility
Browse files Browse the repository at this point in the history
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas committed Sep 18, 2024
1 parent f273fc8 commit cf3f5a4
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 0 deletions.
54 changes: 54 additions & 0 deletions docling_core/utils/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

"""File-related utilities."""

import tempfile
from pathlib import Path
from typing import Union

import requests
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError


def _safe_basename(candidate: str, default: str = "file") -> str:
    """Reduce an untrusted file name to a single, safe path component.

    Args:
        candidate (str): File name or path as received from a URL or header.
        default (str): Name to use when no usable component remains.

    Returns:
        str: The last path component of `candidate`, or `default` if that
            component is empty, "." or "..".
    """
    # Treat backslashes as separators too, so Windows-style paths sent by a
    # server are stripped down to their last component as well.
    base = Path(candidate.replace("\\", "/")).name
    return default if base in ("", ".", "..") else base


def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
    """Resolves the source (URL, path) of a file to a local file path.

    If a URL is provided, the content is first downloaded to a temporary local
    file. The file name is taken from the "filename" directive of the
    Content-Disposition response header if present, otherwise from the URL
    path, and is always reduced to a plain base name so that the download
    cannot be written outside of the temporary directory.

    Args:
        source (Path | AnyHttpUrl | str): The file input source. Can be a path
            or URL.

    Raises:
        ValueError: If source is of unexpected type.
        requests.HTTPError: If the server answers with an error status code.

    Returns:
        Path: The local file path.
    """
    # Only the validation itself is guarded, so that errors raised while
    # downloading are never mistaken for "not a URL".
    try:
        http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
    except ValidationError:
        # Not an HTTP(S) URL: interpret the source as a local path instead.
        try:
            return TypeAdapter(Path).validate_python(source)
        except ValidationError as err:
            raise ValueError(
                f"Unexpected source type encountered: {type(source)}"
            ) from err

    # The context manager releases the streamed connection even on failure.
    with requests.get(str(http_url), stream=True) as res:
        res.raise_for_status()
        fname = None
        # try to get filename from response header
        if cont_disp := res.headers.get("Content-Disposition"):
            for par in cont_disp.split(";"):
                # currently only handling directive "filename" (not "*filename")
                key, _, value = par.partition("=")
                if key.strip() == "filename":
                    fname = value.strip().strip("'\"") or None
                    break
        # otherwise, use name from URL:
        if fname is None:
            fname = http_url.path or ""
        # The name is server-controlled: keep only a safe base name.
        local_path = Path(tempfile.mkdtemp()) / _safe_basename(fname)
        with open(local_path, "wb") as f:
            for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
                f.write(chunk)
    return local_path
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ module = [
"jsonschema.*",
"json_schema_for_humans.*",
"pandas.*",
"requests.*",
"tabulate.*",
]
ignore_missing_imports = true
Expand Down

0 comments on commit cf3f5a4

Please sign in to comment.