Skip to content

Commit

Permalink
Bpemb hot fix (#229)
Browse files Browse the repository at this point in the history
* hot-fix the problem with BPEmb broken base URL

* add changelog
  • Loading branch information
davebulaval authored Jun 23, 2024
1 parent c28f01c commit 166bf28
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 3 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -345,4 +345,5 @@
- Fix app errors.
- Add data validation for 1) multiple consecutive whitespace and 2) newline.
- Fixes some errors in tests.
- Add an argument to the `DatasetContainer` interface to use a pre-processing data cleaning function before validation.
- Add an argument to the `DatasetContainer` interface to use a pre-processing data cleaning function before validation.
- Hot-fix the BPEmb base URL download problem (see [issue 221](https://github.com/GRAAL-Research/deepparse/issues/221)).
11 changes: 9 additions & 2 deletions deepparse/embeddings_models/bpemb_embeddings_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,18 @@

import requests
from bpemb import BPEmb

from numpy.core.multiarray import ndarray
from urllib3.exceptions import InsecureRequestWarning

from .embeddings_model import EmbeddingsModel


class BPEmbBaseURLWrapperBugFix(BPEmb):
    """
    ``BPEmb`` subclass that overrides the ``base_url`` attribute with
    ``https://bpemb.h-its.org/multi/`` before delegating to the parent constructor.

    It exists as a temporary workaround for the broken BPEmb base URL; every keyword argument is
    forwarded untouched to ``BPEmb``.
    """

    def __init__(self, **bpemb_options):
        # The attribute is assigned *before* calling the parent constructor so that it is already
        # present on the instance while ``BPEmb.__init__`` runs.
        # NOTE(review): presumably the parent constructor reads ``self.base_url`` when fetching the
        # model files -- confirm against the installed bpemb version.
        self.base_url = "https://bpemb.h-its.org/multi/"
        super().__init__(**bpemb_options)


class BPEmbEmbeddingsModel(EmbeddingsModel):
"""
BPEmb embeddings network from `BPEmb: Tokenization-free Pre-trained Subword Embeddings in 275 Languages
Expand All @@ -31,7 +36,9 @@ def __init__(self, cache_dir: str, verbose: bool = True) -> None:
# hotfix until https://github.com/bheinzerling/bpemb/issues/63
# is resolved.
with no_ssl_verification():
model = BPEmb(lang="multi", vs=100000, dim=300, cache_dir=Path(cache_dir)) # defaults parameters
# We use the default parameters, except for a dim of 300 and a vs of 100,000.
# We use a BPEmb wrapper since the base URL is broken and the upstream issue was still unresolved as of June 23rd, 2024.
model = BPEmbBaseURLWrapperBugFix(lang="multi", vs=100000, dim=300, cache_dir=Path(cache_dir))
self.model = model

def __call__(self, word: str) -> ndarray:
Expand Down

0 comments on commit 166bf28

Please sign in to comment.