diff --git a/scispacy/file_cache.py b/scispacy/file_cache.py
index d737dab..9ff9918 100644
--- a/scispacy/file_cache.py
+++ b/scispacy/file_cache.py
@@ -12,6 +12,7 @@ from hashlib import sha256
 
 
 import requests
+from tqdm import tqdm
 
 CACHE_ROOT = Path(os.getenv("SCISPACY_CACHE", str(Path.home() / ".scispacy")))
 DATASET_CACHE = str(CACHE_ROOT / "datasets")
@@ -96,9 +97,16 @@ def filename_to_url(filename: str, cache_dir: Optional[str] = None) -> Tuple[str
 
 def http_get(url: str, temp_file: IO) -> None:
+    """Stream the body of `url` into `temp_file`, showing a tqdm progress bar."""
     req = requests.get(url, stream=True)
-    for chunk in req.iter_content(chunk_size=1024):
-        if chunk:  # filter out keep-alive new chunks
-            temp_file.write(chunk)
+    # Falls back to 0 when the response carries no Content-Length header.
+    total = int(req.headers.get("content-length", 0))
+    # The context manager closes the bar even if the download raises midway.
+    with tqdm(total=total, unit="iB", unit_scale=True, unit_divisor=1024) as pbar:
+        for chunk in req.iter_content(chunk_size=1024):
+            if chunk:  # filter out keep-alive new chunks
+                temp_file.write(chunk)
+                # Use len(chunk): write() may not return a byte count.
+                pbar.update(len(chunk))
 
 
 def get_from_cache(url: str, cache_dir: Optional[str] = None) -> str:
diff --git a/setup.py b/setup.py
index c43dca7..e1d2002 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ setup(
 
     name="scispacy",
     version=VERSION["VERSION"],
-    url="https://allenai.github.io/SciSpaCy/",
+    url="https://allenai.github.io/scispacy/",
     author="Allen Institute for Artificial Intelligence",
     author_email="ai2-info@allenai.org",
     description="A full SpaCy pipeline and models for scientific/biomedical documents.",